From 1832617f950b8d4648dbe63dfb7fa8d268343395 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 11 Aug 2023 05:25:02 -0400 Subject: [PATCH 001/126] initial build --- pandas/_libs/arrays.pyx | 45 +- .../_libs/include/pandas/vendored/nanoarrow.h | 3371 +++++++++++++++++ pandas/_libs/meson.build | 2 +- pandas/core/arrays/masked.py | 100 +- 4 files changed, 3473 insertions(+), 45 deletions(-) create mode 100644 pandas/_libs/include/pandas/vendored/nanoarrow.h diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 718fb358e26bc..4b7c86a067fa5 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -7,10 +7,26 @@ import numpy as np cimport numpy as cnp from cpython cimport PyErr_Clear -from numpy cimport ndarray +from numpy cimport ( + int8_t, + int64_t, + ndarray, + uint8_t, +) cnp.import_array() +from libc.stdlib cimport ( + free, + malloc, +) + + +cdef extern from "pandas/vendored/nanoarrow.h": + int8_t ArrowBitGet(const uint8_t*, int64_t) + void ArrowBitSet(uint8_t*, int64_t) + void ArrowBitClear(uint8_t*, int64_t) + @cython.freelist(16) cdef class NDArrayBacked: @@ -189,3 +205,30 @@ cdef class NDArrayBacked: new_values = [obj._ndarray for obj in to_concat] new_arr = cnp.PyArray_Concatenate(new_values, axis) return to_concat[0]._from_backing_data(new_arr) + + +cdef class BitMaskArray: + cdef array_len + cdef uint8_t* validity_buffer + + def __cinit__(self, np_array): + self.array_len = len(np_array) + nbytes = len(np_array) // 8 + 1 + self.validity_buffer = malloc(nbytes) + # malloc + + def __dealloc__(self): + ... + free(self.validity_buffer) + + def __setitem__(self, key, value): + if value: + ArrowBitSet(self.validity_buffer, key) + else: + ArrowBitClear(self.validity_buffer, key) + + def __getitem__(self, key): + bool(ArrowBitGet(self.validity_buffer, key)) + + def to_numpy(self): + ... 
diff --git a/pandas/_libs/include/pandas/vendored/nanoarrow.h b/pandas/_libs/include/pandas/vendored/nanoarrow.h new file mode 100644 index 0000000000000..666dea1448326 --- /dev/null +++ b/pandas/_libs/include/pandas/vendored/nanoarrow.h @@ -0,0 +1,3371 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_BUILD_ID_H_INCLUDED +#define NANOARROW_BUILD_ID_H_INCLUDED + +#define NANOARROW_VERSION_MAJOR 0 +#define NANOARROW_VERSION_MINOR 3 +#define NANOARROW_VERSION_PATCH 0 +#define NANOARROW_VERSION "0.3.0-SNAPSHOT" + +#define NANOARROW_VERSION_INT \ + (NANOARROW_VERSION_MAJOR * 10000 + NANOARROW_VERSION_MINOR * 100 + \ + NANOARROW_VERSION_PATCH) + +// #define NANOARROW_NAMESPACE YourNamespaceHere + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_NANOARROW_TYPES_H_INCLUDED +#define NANOARROW_NANOARROW_TYPES_H_INCLUDED + +#include +#include + + + +#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// Extra guard for versions of Arrow without the canonical guard +#ifndef ARROW_FLAG_DICTIONARY_ORDERED + +/// \defgroup nanoarrow-arrow-cdata Arrow C Data interface +/// +/// The Arrow C Data (https://arrow.apache.org/docs/format/CDataInterface.html) +/// and Arrow C Stream (https://arrow.apache.org/docs/format/CStreamInterface.html) +/// interfaces are part of the +/// Arrow Columnar Format specification +/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for +/// documentation of these structures. 
+/// +/// @{ + +#ifndef ARROW_C_DATA_INTERFACE +#define ARROW_C_DATA_INTERFACE + +#define ARROW_FLAG_DICTIONARY_ORDERED 1 +#define ARROW_FLAG_NULLABLE 2 +#define ARROW_FLAG_MAP_KEYS_SORTED 4 + +struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; +}; + +struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_DATA_INTERFACE + +#ifndef ARROW_C_STREAM_INTERFACE +#define ARROW_C_STREAM_INTERFACE + +struct ArrowArrayStream { + // Callback to get the stream type + // (will be the same for all arrays in the stream). + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowSchema must be released independently from the stream. + int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowArray must be released independently from the stream. + int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. 
+ // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this stream + // (including release). + const char* (*get_last_error)(struct ArrowArrayStream*); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. + void (*release)(struct ArrowArrayStream*); + + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_STREAM_INTERFACE +#endif // ARROW_FLAG_DICTIONARY_ORDERED + +/// \brief Move the contents of src into dst and set src->release to NULL +static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst) { + memcpy(dst, src, sizeof(struct ArrowSchema)); + src->release = NULL; +} + +/// \brief Move the contents of src into dst and set src->release to NULL +static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst) { + memcpy(dst, src, sizeof(struct ArrowArray)); + src->release = NULL; +} + +/// \brief Move the contents of src into dst and set src->release to NULL +static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, + struct ArrowArrayStream* dst) { + memcpy(dst, src, sizeof(struct ArrowArrayStream)); + src->release = NULL; +} + +/// @} + +// Utility macros +#define _NANOARROW_CONCAT(x, y) x##y +#define _NANOARROW_MAKE_NAME(x, y) _NANOARROW_CONCAT(x, y) + +#define _NANOARROW_RETURN_NOT_OK_IMPL(NAME, EXPR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) return NAME; \ + } while (0) + +#define _NANOARROW_CHECK_RANGE(x_, min_, max_) \ + NANOARROW_RETURN_NOT_OK((x_ >= min_ && x_ <= max_) ? NANOARROW_OK : EINVAL) + +#define _NANOARROW_CHECK_UPPER_LIMIT(x_, max_) \ + NANOARROW_RETURN_NOT_OK((x_ <= max_) ? 
NANOARROW_OK : EINVAL) + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d\n* %s:%d", EXPR_STR, \ + NAME, __FILE__, __LINE__); \ + return NAME; \ + } \ + } while (0) +#else +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d", EXPR_STR, NAME); \ + return NAME; \ + } \ + } while (0) +#endif + +/// \brief Return code for success. +/// \ingroup nanoarrow-errors +#define NANOARROW_OK 0 + +/// \brief Represents an errno-compatible error code +/// \ingroup nanoarrow-errors +typedef int ArrowErrorCode; + +/// \brief Check the result of an expression and return it if not NANOARROW_OK +/// \ingroup nanoarrow-errors +#define NANOARROW_RETURN_NOT_OK(EXPR) \ + _NANOARROW_RETURN_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR) + +/// \brief Check the result of an expression and return it if not NANOARROW_OK, +/// adding an auto-generated message to an ArrowError. +/// \ingroup nanoarrow-errors +/// +/// This macro is used to ensure that functions that accept an ArrowError +/// as input always set its message when returning an error code (e.g., when calling +/// a nanoarrow function that does *not* accept ArrowError). 
+#define NANOARROW_RETURN_NOT_OK_WITH_ERROR(EXPR, ERROR_EXPR) \ + _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL( \ + _NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, ERROR_EXPR, #EXPR) + +#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) +#define NANOARROW_PRINT_AND_DIE(VALUE, EXPR_STR) \ + do { \ + fprintf(stderr, "%s failed with errno %d\n* %s:%d\n", EXPR_STR, (int)(VALUE), \ + __FILE__, (int)__LINE__); \ + abort(); \ + } while (0) +#endif + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_ASSERT_OK_IMPL(NAME, EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) NANOARROW_PRINT_AND_DIE(NAME, EXPR_STR); \ + } while (0) + +/// \brief Assert that an expression's value is NANOARROW_OK +/// \ingroup nanoarrow-errors +/// +/// If nanoarrow was built in debug mode (i.e., defined(NANOARROW_DEBUG) is true), +/// print a message to stderr and abort. If nanoarrow was bulit in release mode, +/// this statement has no effect. You can customize fatal error behaviour +/// be defining the NANOARROW_PRINT_AND_DIE macro before including nanoarrow.h +/// This macro is provided as a convenience for users and is not used internally. +#define NANOARROW_ASSERT_OK(EXPR) \ + _NANOARROW_ASSERT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, #EXPR) +#else +#define NANOARROW_ASSERT_OK(EXPR) EXPR +#endif + +static char _ArrowIsLittleEndian(void) { + uint32_t check = 1; + char first_byte; + memcpy(&first_byte, &check, sizeof(char)); + return first_byte; +} + +/// \brief Arrow type enumerator +/// \ingroup nanoarrow-utils +/// +/// These names are intended to map to the corresponding arrow::Type::type +/// enumerator; however, the numeric values are specifically not equal +/// (i.e., do not rely on numeric comparison). 
+enum ArrowType { + NANOARROW_TYPE_UNINITIALIZED = 0, + NANOARROW_TYPE_NA = 1, + NANOARROW_TYPE_BOOL, + NANOARROW_TYPE_UINT8, + NANOARROW_TYPE_INT8, + NANOARROW_TYPE_UINT16, + NANOARROW_TYPE_INT16, + NANOARROW_TYPE_UINT32, + NANOARROW_TYPE_INT32, + NANOARROW_TYPE_UINT64, + NANOARROW_TYPE_INT64, + NANOARROW_TYPE_HALF_FLOAT, + NANOARROW_TYPE_FLOAT, + NANOARROW_TYPE_DOUBLE, + NANOARROW_TYPE_STRING, + NANOARROW_TYPE_BINARY, + NANOARROW_TYPE_FIXED_SIZE_BINARY, + NANOARROW_TYPE_DATE32, + NANOARROW_TYPE_DATE64, + NANOARROW_TYPE_TIMESTAMP, + NANOARROW_TYPE_TIME32, + NANOARROW_TYPE_TIME64, + NANOARROW_TYPE_INTERVAL_MONTHS, + NANOARROW_TYPE_INTERVAL_DAY_TIME, + NANOARROW_TYPE_DECIMAL128, + NANOARROW_TYPE_DECIMAL256, + NANOARROW_TYPE_LIST, + NANOARROW_TYPE_STRUCT, + NANOARROW_TYPE_SPARSE_UNION, + NANOARROW_TYPE_DENSE_UNION, + NANOARROW_TYPE_DICTIONARY, + NANOARROW_TYPE_MAP, + NANOARROW_TYPE_EXTENSION, + NANOARROW_TYPE_FIXED_SIZE_LIST, + NANOARROW_TYPE_DURATION, + NANOARROW_TYPE_LARGE_STRING, + NANOARROW_TYPE_LARGE_BINARY, + NANOARROW_TYPE_LARGE_LIST, + NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO +}; + +/// \brief Get a string value of an enum ArrowType value +/// \ingroup nanoarrow-utils +/// +/// Returns NULL for invalid values for type +static inline const char* ArrowTypeString(enum ArrowType type); + +static inline const char* ArrowTypeString(enum ArrowType type) { + switch (type) { + case NANOARROW_TYPE_NA: + return "na"; + case NANOARROW_TYPE_BOOL: + return "bool"; + case NANOARROW_TYPE_UINT8: + return "uint8"; + case NANOARROW_TYPE_INT8: + return "int8"; + case NANOARROW_TYPE_UINT16: + return "uint16"; + case NANOARROW_TYPE_INT16: + return "int16"; + case NANOARROW_TYPE_UINT32: + return "uint32"; + case NANOARROW_TYPE_INT32: + return "int32"; + case NANOARROW_TYPE_UINT64: + return "uint64"; + case NANOARROW_TYPE_INT64: + return "int64"; + case NANOARROW_TYPE_HALF_FLOAT: + return "half_float"; + case NANOARROW_TYPE_FLOAT: + return "float"; + case NANOARROW_TYPE_DOUBLE: + 
return "double"; + case NANOARROW_TYPE_STRING: + return "string"; + case NANOARROW_TYPE_BINARY: + return "binary"; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + return "fixed_size_binary"; + case NANOARROW_TYPE_DATE32: + return "date32"; + case NANOARROW_TYPE_DATE64: + return "date64"; + case NANOARROW_TYPE_TIMESTAMP: + return "timestamp"; + case NANOARROW_TYPE_TIME32: + return "time32"; + case NANOARROW_TYPE_TIME64: + return "time64"; + case NANOARROW_TYPE_INTERVAL_MONTHS: + return "interval_months"; + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + return "interval_day_time"; + case NANOARROW_TYPE_DECIMAL128: + return "decimal128"; + case NANOARROW_TYPE_DECIMAL256: + return "decimal256"; + case NANOARROW_TYPE_LIST: + return "list"; + case NANOARROW_TYPE_STRUCT: + return "struct"; + case NANOARROW_TYPE_SPARSE_UNION: + return "sparse_union"; + case NANOARROW_TYPE_DENSE_UNION: + return "dense_union"; + case NANOARROW_TYPE_DICTIONARY: + return "dictionary"; + case NANOARROW_TYPE_MAP: + return "map"; + case NANOARROW_TYPE_EXTENSION: + return "extension"; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + return "fixed_size_list"; + case NANOARROW_TYPE_DURATION: + return "duration"; + case NANOARROW_TYPE_LARGE_STRING: + return "large_string"; + case NANOARROW_TYPE_LARGE_BINARY: + return "large_binary"; + case NANOARROW_TYPE_LARGE_LIST: + return "large_list"; + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + return "interval_month_day_nano"; + default: + return NULL; + } +} + +/// \brief Arrow time unit enumerator +/// \ingroup nanoarrow-utils +/// +/// These names and values map to the corresponding arrow::TimeUnit::type +/// enumerator. +enum ArrowTimeUnit { + NANOARROW_TIME_UNIT_SECOND = 0, + NANOARROW_TIME_UNIT_MILLI = 1, + NANOARROW_TIME_UNIT_MICRO = 2, + NANOARROW_TIME_UNIT_NANO = 3 +}; + +/// \brief Validation level enumerator +/// \ingroup nanoarrow-array +enum ArrowValidationLevel { + /// \brief Do not validate buffer sizes or content. 
+ NANOARROW_VALIDATION_LEVEL_NONE = 0, + + /// \brief Validate buffer sizes that depend on array length but do not validate buffer + /// sizes that depend on buffer data access. + NANOARROW_VALIDATION_LEVEL_MINIMAL = 1, + + /// \brief Validate all buffer sizes, including those that require buffer data access, + /// but do not perform any checks that are O(1) along the length of the buffers. + NANOARROW_VALIDATION_LEVEL_DEFAULT = 2, + + /// \brief Validate all buffer sizes and all buffer content. This is useful in the + /// context of untrusted input or input that may have been corrupted in transit. + NANOARROW_VALIDATION_LEVEL_FULL = 3 +}; + +/// \brief Get a string value of an enum ArrowTimeUnit value +/// \ingroup nanoarrow-utils +/// +/// Returns NULL for invalid values for time_unit +static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit); + +static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit) { + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + return "s"; + case NANOARROW_TIME_UNIT_MILLI: + return "ms"; + case NANOARROW_TIME_UNIT_MICRO: + return "us"; + case NANOARROW_TIME_UNIT_NANO: + return "ns"; + default: + return NULL; + } +} + +/// \brief Functional types of buffers as described in the Arrow Columnar Specification +/// \ingroup nanoarrow-array-view +enum ArrowBufferType { + NANOARROW_BUFFER_TYPE_NONE, + NANOARROW_BUFFER_TYPE_VALIDITY, + NANOARROW_BUFFER_TYPE_TYPE_ID, + NANOARROW_BUFFER_TYPE_UNION_OFFSET, + NANOARROW_BUFFER_TYPE_DATA_OFFSET, + NANOARROW_BUFFER_TYPE_DATA +}; + +/// \brief An non-owning view of a string +/// \ingroup nanoarrow-utils +struct ArrowStringView { + /// \brief A pointer to the start of the string + /// + /// If size_bytes is 0, this value may be NULL. + const char* data; + + /// \brief The size of the string in bytes, + /// + /// (Not including the null terminator.) 
+ int64_t size_bytes; +}; + +/// \brief Return a view of a const C string +/// \ingroup nanoarrow-utils +static inline struct ArrowStringView ArrowCharView(const char* value); + +static inline struct ArrowStringView ArrowCharView(const char* value) { + struct ArrowStringView out; + + out.data = value; + if (value) { + out.size_bytes = (int64_t)strlen(value); + } else { + out.size_bytes = 0; + } + + return out; +} + +union ArrowBufferViewData { + const void* data; + const int8_t* as_int8; + const uint8_t* as_uint8; + const int16_t* as_int16; + const uint16_t* as_uint16; + const int32_t* as_int32; + const uint32_t* as_uint32; + const int64_t* as_int64; + const uint64_t* as_uint64; + const double* as_double; + const float* as_float; + const char* as_char; +}; + +/// \brief An non-owning view of a buffer +/// \ingroup nanoarrow-utils +struct ArrowBufferView { + /// \brief A pointer to the start of the buffer + /// + /// If size_bytes is 0, this value may be NULL. + union ArrowBufferViewData data; + + /// \brief The size of the buffer in bytes + int64_t size_bytes; +}; + +/// \brief Array buffer allocation and deallocation +/// \ingroup nanoarrow-buffer +/// +/// Container for allocate, reallocate, and free methods that can be used +/// to customize allocation and deallocation of buffers when constructing +/// an ArrowArray. 
+struct ArrowBufferAllocator { + /// \brief Reallocate a buffer or return NULL if it cannot be reallocated + uint8_t* (*reallocate)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, + int64_t old_size, int64_t new_size); + + /// \brief Deallocate a buffer allocated by this allocator + void (*free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size); + + /// \brief Opaque data specific to the allocator + void* private_data; +}; + +/// \brief An owning mutable view of a buffer +/// \ingroup nanoarrow-buffer +struct ArrowBuffer { + /// \brief A pointer to the start of the buffer + /// + /// If capacity_bytes is 0, this value may be NULL. + uint8_t* data; + + /// \brief The size of the buffer in bytes + int64_t size_bytes; + + /// \brief The capacity of the buffer in bytes + int64_t capacity_bytes; + + /// \brief The allocator that will be used to reallocate and/or free the buffer + struct ArrowBufferAllocator allocator; +}; + +/// \brief An owning mutable view of a bitmap +/// \ingroup nanoarrow-bitmap +struct ArrowBitmap { + /// \brief An ArrowBuffer to hold the allocated memory + struct ArrowBuffer buffer; + + /// \brief The number of bits that have been appended to the bitmap + int64_t size_bits; +}; + +/// \brief A description of an arrangement of buffers +/// \ingroup nanoarrow-utils +/// +/// Contains the minimum amount of information required to +/// calculate the size of each buffer in an ArrowArray knowing only +/// the length and offset of the array. 
+struct ArrowLayout { + /// \brief The function of each buffer + enum ArrowBufferType buffer_type[3]; + + /// \brief The data type of each buffer + enum ArrowType buffer_data_type[3]; + + /// \brief The size of an element each buffer or 0 if this size is variable or unknown + int64_t element_size_bits[3]; + + /// \brief The number of elements in the child array per element in this array for a + /// fixed-size list + int64_t child_size_elements; +}; + +/// \brief A non-owning view of an ArrowArray +/// \ingroup nanoarrow-array-view +/// +/// This data structure provides access to the values contained within +/// an ArrowArray with fields provided in a more readily-extractible +/// form. You can re-use an ArrowArrayView for multiple ArrowArrays +/// with the same storage type, use it to represent a hypothetical +/// ArrowArray that does not exist yet, or use it to validate the buffers +/// of a future ArrowArray. +struct ArrowArrayView { + /// \brief The underlying ArrowArray or NULL if it has not been set or + /// if the buffers in this ArrowArrayView are not backed by an ArrowArray. + struct ArrowArray* array; + + /// \brief The number of elements from the physical start of the buffers. + int64_t offset; + + /// \brief The number of elements in this view. + int64_t length; + + /// \brief A cached null count or -1 to indicate that this value is unknown. + int64_t null_count; + + /// \brief The type used to store values in this array + /// + /// This type represents only the minimum required information to + /// extract values from the array buffers (e.g., for a Date32 array, + /// this value will be NANOARROW_TYPE_INT32). For dictionary-encoded + /// arrays, this will be the index type. 
+ enum ArrowType storage_type; + + /// \brief The buffer types, strides, and sizes of this Array's buffers + struct ArrowLayout layout; + + /// \brief This Array's buffers as ArrowBufferView objects + struct ArrowBufferView buffer_views[3]; + + /// \brief The number of children of this view + int64_t n_children; + + /// \brief Pointers to views of this array's children + struct ArrowArrayView** children; + + /// \brief Pointer to a view of this array's dictionary + struct ArrowArrayView* dictionary; + + /// \brief Union type id to child index mapping + /// + /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer + /// such that child_index == union_type_id_map[type_id] and + /// type_id == union_type_id_map[128 + child_index]. This value may be + /// NULL in the case where child_id == type_id. + int8_t* union_type_id_map; +}; + +// Used as the private data member for ArrowArrays allocated here and accessed +// internally within inline ArrowArray* helpers. +struct ArrowArrayPrivateData { + // Holder for the validity buffer (or first buffer for union types, which are + // the only type whose first buffer is not a valdiity buffer) + struct ArrowBitmap bitmap; + + // Holder for additional buffers as required + struct ArrowBuffer buffers[2]; + + // The array of pointers to buffers. This must be updated after a sequence + // of appends to synchronize its values with the actual buffer addresses + // (which may have ben reallocated uring that time) + const void* buffer_data[3]; + + // The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown + enum ArrowType storage_type; + + // The buffer arrangement for the storage type + struct ArrowLayout layout; + + // Flag to indicate if there are non-sequence union type ids. 
+ // In the future this could be replaced with a type id<->child mapping + // to support constructing unions in append mode where type_id != child_index + int8_t union_type_id_is_child_index; +}; + +/// \brief A representation of an interval. +/// \ingroup nanoarrow-utils +struct ArrowInterval { + /// \brief The type of interval being used + enum ArrowType type; + /// \brief The number of months represented by the interval + int32_t months; + /// \brief The number of days represented by the interval + int32_t days; + /// \brief The number of ms represented by the interval + int32_t ms; + /// \brief The number of ns represented by the interval + int64_t ns; +}; + +/// \brief Zero initialize an Interval with a given unit +/// \ingroup nanoarrow-utils +static inline void ArrowIntervalInit(struct ArrowInterval* interval, + enum ArrowType type) { + memset(interval, 0, sizeof(struct ArrowInterval)); + interval->type = type; +} + +/// \brief A representation of a fixed-precision decimal number +/// \ingroup nanoarrow-utils +/// +/// This structure should be initialized with ArrowDecimalInit() once and +/// values set using ArrowDecimalSetInt(), ArrowDecimalSetBytes128(), +/// or ArrowDecimalSetBytes256(). +struct ArrowDecimal { + /// \brief An array of 64-bit integers of n_words length defined in native-endian order + uint64_t words[4]; + + /// \brief The number of significant digits this decimal number can represent + int32_t precision; + + /// \brief The number of digits after the decimal point. This can be negative. 
+ int32_t scale; + + /// \brief The number of words in the words array + int n_words; + + /// \brief Cached value used by the implementation + int high_word_index; + + /// \brief Cached value used by the implementation + int low_word_index; +}; + +/// \brief Initialize a decimal with a given set of type parameters +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalInit(struct ArrowDecimal* decimal, int32_t bitwidth, + int32_t precision, int32_t scale) { + memset(decimal->words, 0, sizeof(decimal->words)); + decimal->precision = precision; + decimal->scale = scale; + decimal->n_words = bitwidth / 8 / sizeof(uint64_t); + + if (_ArrowIsLittleEndian()) { + decimal->low_word_index = 0; + decimal->high_word_index = decimal->n_words - 1; + } else { + decimal->low_word_index = decimal->n_words - 1; + decimal->high_word_index = 0; + } +} + +/// \brief Get a signed integer value of a sufficiently small ArrowDecimal +/// +/// This does not check if the decimal's precision sufficiently small to fit +/// within the signed 64-bit integer range (A precision less than or equal +/// to 18 is sufficiently small). 
+static inline int64_t ArrowDecimalGetIntUnsafe(struct ArrowDecimal* decimal) { + return (int64_t)decimal->words[decimal->low_word_index]; +} + +/// \brief Copy the bytes of this decimal into a sufficiently large buffer +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalGetBytes(struct ArrowDecimal* decimal, uint8_t* out) { + memcpy(out, decimal->words, decimal->n_words * sizeof(uint64_t)); +} + +/// \brief Returns 1 if the value represented by decimal is >= 0 or -1 otherwise +/// \ingroup nanoarrow-utils +static inline int64_t ArrowDecimalSign(struct ArrowDecimal* decimal) { + return 1 | ((int64_t)(decimal->words[decimal->high_word_index]) >> 63); +} + +/// \brief Sets the integer value of this decimal +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalSetInt(struct ArrowDecimal* decimal, int64_t value) { + if (value < 0) { + memset(decimal->words, 0xff, decimal->n_words * sizeof(uint64_t)); + } else { + memset(decimal->words, 0, decimal->n_words * sizeof(uint64_t)); + } + + decimal->words[decimal->low_word_index] = value; +} + +/// \brief Copy bytes from a buffer into this decimal +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, + const uint8_t* value) { + memcpy(decimal->words, value, decimal->n_words * sizeof(uint64_t)); +} + +#ifdef __cplusplus +} +#endif + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_H_INCLUDED +#define NANOARROW_H_INCLUDED + +#include +#include +#include + + + +// If using CMake, optionally pass -DNANOARROW_NAMESPACE=MyNamespace which will set this +// define in nanoarrow_config.h. If not, you can optionally #define NANOARROW_NAMESPACE +// MyNamespace here. + +// This section remaps the non-prefixed symbols to the prefixed symbols so that +// code written against this build can be used independent of the value of +// NANOARROW_NAMESPACE. +#ifdef NANOARROW_NAMESPACE +#define NANOARROW_CAT(A, B) A##B +#define NANOARROW_SYMBOL(A, B) NANOARROW_CAT(A, B) + +#define ArrowNanoarrowVersion NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersion) +#define ArrowNanoarrowVersionInt \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersionInt) +#define ArrowErrorMessage NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowErrorMessage) +#define ArrowMalloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMalloc) +#define ArrowRealloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowRealloc) +#define ArrowFree NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowFree) +#define ArrowBufferAllocatorDefault \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferAllocatorDefault) +#define ArrowBufferDeallocator \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferDeallocator) +#define ArrowErrorSet NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowErrorSet) +#define ArrowLayoutInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowLayoutInit) +#define ArrowSchemaInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInit) +#define ArrowSchemaInitFromType \ + 
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInitFromType) +#define ArrowSchemaSetType NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetType) +#define ArrowSchemaSetTypeStruct \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeStruct) +#define ArrowSchemaSetTypeFixedSize \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeFixedSize) +#define ArrowSchemaSetTypeDecimal \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDecimal) +#define ArrowSchemaSetTypeDateTime \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDateTime) +#define ArrowSchemaSetTypeUnion \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeUnion) +#define ArrowSchemaDeepCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaDeepCopy) +#define ArrowSchemaSetFormat NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetFormat) +#define ArrowSchemaSetName NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetName) +#define ArrowSchemaSetMetadata \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetMetadata) +#define ArrowSchemaAllocateChildren \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateChildren) +#define ArrowSchemaAllocateDictionary \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateDictionary) +#define ArrowMetadataReaderInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderInit) +#define ArrowMetadataReaderRead \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderRead) +#define ArrowMetadataSizeOf NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataSizeOf) +#define ArrowMetadataHasKey NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataHasKey) +#define ArrowMetadataGetValue NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataGetValue) +#define ArrowMetadataBuilderInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderInit) +#define ArrowMetadataBuilderAppend \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderAppend) +#define ArrowMetadataBuilderSet \ + 
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderSet) +#define ArrowMetadataBuilderRemove \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderRemove) +#define ArrowSchemaViewInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaViewInit) +#define ArrowSchemaToString NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaToString) +#define ArrowArrayInitFromType \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromType) +#define ArrowArrayInitFromSchema \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromSchema) +#define ArrowArrayInitFromArrayView \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) +#define ArrowArrayInitFromArrayView \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) +#define ArrowArrayAllocateChildren \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateChildren) +#define ArrowArrayAllocateDictionary \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateDictionary) +#define ArrowArraySetValidityBitmap \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetValidityBitmap) +#define ArrowArraySetBuffer NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetBuffer) +#define ArrowArrayReserve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayReserve) +#define ArrowArrayFinishBuilding \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuilding) +#define ArrowArrayFinishBuildingDefault \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuildingDefault) +#define ArrowArrayViewInitFromType \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromType) +#define ArrowArrayViewInitFromSchema \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromSchema) +#define ArrowArrayViewAllocateChildren \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateChildren) +#define ArrowArrayViewAllocateDictionary \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateDictionary) +#define ArrowArrayViewSetLength \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, 
ArrowArrayViewSetLength) +#define ArrowArrayViewSetArray \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArray) +#define ArrowArrayViewSetArrayMinimal \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArrayMinimal) +#define ArrowArrayViewValidate \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidate) +#define ArrowArrayViewReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewReset) +#define ArrowBasicArrayStreamInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit) +#define ArrowBasicArrayStreamSetArray \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamSetArray) +#define ArrowBasicArrayStreamValidate \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamValidate) + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/// \defgroup nanoarrow Nanoarrow C library +/// +/// Except where noted, objects are not thread-safe and clients should +/// take care to serialize accesses to methods. +/// +/// Because this library is intended to be vendored, it provides full type +/// definitions and encourages clients to stack or statically allocate +/// where convenient. + +/// \defgroup nanoarrow-malloc Memory management +/// +/// Non-buffer members of a struct ArrowSchema and struct ArrowArray +/// must be allocated using ArrowMalloc() or ArrowRealloc() and freed +/// using ArrowFree() for schemas and arrays allocated here. Buffer members +/// are allocated using an ArrowBufferAllocator. +/// +/// @{ + +/// \brief Allocate like malloc() +void* ArrowMalloc(int64_t size); + +/// \brief Reallocate like realloc() +void* ArrowRealloc(void* ptr, int64_t size); + +/// \brief Free a pointer allocated using ArrowMalloc() or ArrowRealloc(). +void ArrowFree(void* ptr); + +/// \brief Return the default allocator +/// +/// The default allocator uses ArrowMalloc(), ArrowRealloc(), and +/// ArrowFree(). 
+struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void); + +/// \brief Create a custom deallocator +/// +/// Creates a buffer allocator with only a free method that can be used to +/// attach a custom deallocator to an ArrowBuffer. This may be used to +/// avoid copying an existing buffer that was not allocated using the +/// infrastructure provided here (e.g., by an R or Python object). +struct ArrowBufferAllocator ArrowBufferDeallocator( + void (*custom_free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, + int64_t size), + void* private_data); + +/// @} + +/// \defgroup nanoarrow-errors Error handling +/// +/// Functions generally return an errno-compatible error code; functions that +/// need to communicate more verbose error information accept a pointer +/// to an ArrowError. This can be stack or statically allocated. The +/// content of the message is undefined unless an error code has been +/// returned. If a nanoarrow function is passed a non-null ArrowError pointer, the +/// ArrowError pointed to by the argument will be propagated with a +/// null-terminated error message. It is safe to pass a NULL ArrowError anywhere +/// in the nanoarrow API. +/// +/// Except where documented, it is generally not safe to continue after a +/// function has returned a non-zero ArrowErrorCode. The NANOARROW_RETURN_NOT_OK and +/// NANOARROW_ASSERT_OK macros are provided to help propagate errors. C++ clients can use +/// the helpers provided in the nanoarrow.hpp header to facilitate using C++ idioms +/// for memory management and error propgagtion. +/// +/// @{ + +/// \brief Error type containing a UTF-8 encoded message. +struct ArrowError { + /// \brief A character buffer with space for an error message. + char message[1024]; +}; + +/// \brief Ensure an ArrowError is null-terminated by zeroing the first character. +/// +/// If error is NULL, this function does nothing. 
+static inline void ArrowErrorInit(struct ArrowError* error) { + if (error) { + error->message[0] = '\0'; + } +} + +/// \brief Set the contents of an error using printf syntax. +/// +/// If error is NULL, this function does nothing and returns NANOARROW_OK. +ArrowErrorCode ArrowErrorSet(struct ArrowError* error, const char* fmt, ...); + +/// \brief Get the contents of an error +/// +/// If error is NULL, returns "", or returns the contents of the error message +/// otherwise. +const char* ArrowErrorMessage(struct ArrowError* error); + +/// @} + +/// \defgroup nanoarrow-utils Utility data structures +/// +/// @{ + +/// \brief Return a version string in the form "major.minor.patch" +const char* ArrowNanoarrowVersion(void); + +/// \brief Return an integer that can be used to compare versions sequentially +int ArrowNanoarrowVersionInt(void); + +/// \brief Initialize a description of buffer arrangements from a storage type +void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type); + +/// \brief Create a string view from a null-terminated string +static inline struct ArrowStringView ArrowCharView(const char* value); + +/// @} + +/// \defgroup nanoarrow-schema Creating schemas +/// +/// These functions allocate, copy, and destroy ArrowSchema structures +/// +/// @{ + +/// \brief Initialize an ArrowSchema +/// +/// Initializes the fields and release callback of schema_out. Caller +/// is responsible for calling the schema->release callback if +/// NANOARROW_OK is returned. +void ArrowSchemaInit(struct ArrowSchema* schema); + +/// \brief Initialize an ArrowSchema from an ArrowType +/// +/// A convenience constructor for that calls ArrowSchemaInit() and +/// ArrowSchemaSetType() for the common case of constructing an +/// unparameterized type. The caller is responsible for calling the schema->release +/// callback if NANOARROW_OK is returned. 
+ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type); + +/// \brief Get a human-readable summary of a Schema +/// +/// Writes a summary of an ArrowSchema to out (up to n - 1 characters) +/// and returns the number of characters required for the output if +/// n were sufficiently large. If recursive is non-zero, the result will +/// also include children. +int64_t ArrowSchemaToString(struct ArrowSchema* schema, char* out, int64_t n, + char recursive); + +/// \brief Set the format field of a schema from an ArrowType +/// +/// Initializes the fields and release callback of schema_out. For +/// NANOARROW_TYPE_LIST, NANOARROW_TYPE_LARGE_LIST, and +/// NANOARROW_TYPE_MAP, the appropriate number of children are +/// allocated, initialized, and named; however, the caller must +/// ArrowSchemaSetType() on the preinitialized children. Schema must have been initialized +/// using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type); + +/// \brief Set the format field and initialize children of a struct schema +/// +/// The specified number of children are initialized; however, the caller is responsible +/// for calling ArrowSchemaSetType() and ArrowSchemaSetName() on each child. +/// Schema must have been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children); + +/// \brief Set the format field of a fixed-size schema +/// +/// Returns EINVAL for fixed_size <= 0 or for type that is not +/// NANOARROW_TYPE_FIXED_SIZE_BINARY or NANOARROW_TYPE_FIXED_SIZE_LIST. +/// For NANOARROW_TYPE_FIXED_SIZE_LIST, the appropriate number of children are +/// allocated, initialized, and named; however, the caller must +/// ArrowSchemaSetType() the first child. Schema must have been initialized using +/// ArrowSchemaInit() or ArrowSchemaDeepCopy(). 
+ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema, + enum ArrowType type, int32_t fixed_size); + +/// \brief Set the format field of a decimal schema +/// +/// Returns EINVAL for scale <= 0 or for type that is not +/// NANOARROW_TYPE_DECIMAL128 or NANOARROW_TYPE_DECIMAL256. Schema must have been +/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type, + int32_t decimal_precision, + int32_t decimal_scale); + +/// \brief Set the format field of a time, timestamp, or duration schema +/// +/// Returns EINVAL for type that is not +/// NANOARROW_TYPE_TIME32, NANOARROW_TYPE_TIME64, +/// NANOARROW_TYPE_TIMESTAMP, or NANOARROW_TYPE_DURATION. The +/// timezone parameter must be NULL for a non-timestamp type. Schema must have been +/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type, + enum ArrowTimeUnit time_unit, + const char* timezone); + +/// \brief Seet the format field of a union schema +/// +/// Returns EINVAL for a type that is not NANOARROW_TYPE_DENSE_UNION +/// or NANOARROW_TYPE_SPARSE_UNION. The specified number of children are +/// allocated, and initialized. +ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type, + int64_t n_children); + +/// \brief Make a (recursive) copy of a schema +/// +/// Allocates and copies fields of schema into schema_out. +ArrowErrorCode ArrowSchemaDeepCopy(struct ArrowSchema* schema, + struct ArrowSchema* schema_out); + +/// \brief Copy format into schema->format +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). 
+ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format); + +/// \brief Copy name into schema->name +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name); + +/// \brief Copy metadata into schema->metadata +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy. +ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* metadata); + +/// \brief Allocate the schema->children array +/// +/// Includes the memory for each child struct ArrowSchema. +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema, + int64_t n_children); + +/// \brief Allocate the schema->dictionary member +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema); + +/// @} + +/// \defgroup nanoarrow-metadata Create, read, and modify schema metadata +/// +/// @{ + +/// \brief Reader for key/value pairs in schema metadata +/// +/// The ArrowMetadataReader does not own any data and is only valid +/// for the lifetime of the underlying metadata pointer. +struct ArrowMetadataReader { + /// \brief A metadata string from a schema->metadata field. 
+ const char* metadata; + + /// \brief The current offset into the metadata string + int64_t offset; + + /// \brief The number of remaining keys + int32_t remaining_keys; +}; + +/// \brief Initialize an ArrowMetadataReader +ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader, + const char* metadata); + +/// \brief Read the next key/value pair from an ArrowMetadataReader +ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader, + struct ArrowStringView* key_out, + struct ArrowStringView* value_out); + +/// \brief The number of bytes in in a key/value metadata string +int64_t ArrowMetadataSizeOf(const char* metadata); + +/// \brief Check for a key in schema metadata +char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key); + +/// \brief Extract a value from schema metadata +/// +/// If key does not exist in metadata, value_out is unmodified +ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key, + struct ArrowStringView* value_out); + +/// \brief Initialize a builder for schema metadata from key/value pairs +/// +/// metadata can be an existing metadata string or NULL to initialize +/// an empty metadata string. +ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, const char* metadata); + +/// \brief Append a key/value pair to a buffer containing serialized metadata +ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value); + +/// \brief Set a key/value pair to a buffer containing serialized metadata +/// +/// Ensures that the only entry for key in the metadata is set to value. +/// This function maintains the existing position of (the first instance of) +/// key if present in the data. 
+ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value); + +/// \brief Remove a key from a buffer containing serialized metadata +ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, + struct ArrowStringView key); + +/// @} + +/// \defgroup nanoarrow-schema-view Reading schemas +/// +/// @{ + +/// \brief A non-owning view of a parsed ArrowSchema +/// +/// Contains more readily extractable values than a raw ArrowSchema. +/// Clients can stack or statically allocate this structure but are +/// encouraged to use the provided getters to ensure forward +/// compatiblity. +struct ArrowSchemaView { + /// \brief A pointer to the schema represented by this view + struct ArrowSchema* schema; + + /// \brief The data type represented by the schema + /// + /// This value may be NANOARROW_TYPE_DICTIONARY if the schema has a + /// non-null dictionary member; datetime types are valid values. + /// This value will never be NANOARROW_TYPE_EXTENSION (see + /// extension_name and/or extension_metadata to check for + /// an extension type). + enum ArrowType type; + + /// \brief The storage data type represented by the schema + /// + /// This value will never be NANOARROW_TYPE_DICTIONARY, NANOARROW_TYPE_EXTENSION + /// or any datetime type. This value represents only the type required to + /// interpret the buffers in the array. + enum ArrowType storage_type; + + /// \brief The storage layout represented by the schema + struct ArrowLayout layout; + + /// \brief The extension type name if it exists + /// + /// If the ARROW:extension:name key is present in schema.metadata, + /// extension_name.data will be non-NULL. + struct ArrowStringView extension_name; + + /// \brief The extension type metadata if it exists + /// + /// If the ARROW:extension:metadata key is present in schema.metadata, + /// extension_metadata.data will be non-NULL. 
+ struct ArrowStringView extension_metadata; + + /// \brief Format fixed size parameter + /// + /// This value is set when parsing a fixed-size binary or fixed-size + /// list schema; this value is undefined for other types. For a + /// fixed-size binary schema this value is in bytes; for a fixed-size + /// list schema this value refers to the number of child elements for + /// each element of the parent. + int32_t fixed_size; + + /// \brief Decimal bitwidth + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_bitwidth; + + /// \brief Decimal precision + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_precision; + + /// \brief Decimal scale + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_scale; + + /// \brief Format time unit parameter + /// + /// This value is set when parsing a date/time type. The value is + /// undefined for other types. + enum ArrowTimeUnit time_unit; + + /// \brief Format timezone parameter + /// + /// This value is set when parsing a timestamp type and represents + /// the timezone format parameter. This value points to + /// data within the schema and is undefined for other types. + const char* timezone; + + /// \brief Union type ids parameter + /// + /// This value is set when parsing a union type and represents + /// type ids parameter. This value points to + /// data within the schema and is undefined for other types. 
+ const char* union_type_ids; +}; + +/// \brief Initialize an ArrowSchemaView +ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, + struct ArrowSchema* schema, struct ArrowError* error); + +/// @} + +/// \defgroup nanoarrow-buffer Owning, growable buffers +/// +/// @{ + +/// \brief Initialize an ArrowBuffer +/// +/// Initialize a buffer with a NULL, zero-size buffer using the default +/// buffer allocator. +static inline void ArrowBufferInit(struct ArrowBuffer* buffer); + +/// \brief Set a newly-initialized buffer's allocator +/// +/// Returns EINVAL if the buffer has already been allocated. +static inline ArrowErrorCode ArrowBufferSetAllocator( + struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator); + +/// \brief Reset an ArrowBuffer +/// +/// Releases the buffer using the allocator's free method if +/// the buffer's data member is non-null, sets the data member +/// to NULL, and sets the buffer's size and capacity to 0. +static inline void ArrowBufferReset(struct ArrowBuffer* buffer); + +/// \brief Move an ArrowBuffer +/// +/// Transfers the buffer data and lifecycle management to another +/// address and resets buffer. +static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst); + +/// \brief Grow or shrink a buffer to a given capacity +/// +/// When shrinking the capacity of the buffer, the buffer is only reallocated +/// if shrink_to_fit is non-zero. Calling ArrowBufferResize() does not +/// adjust the buffer's size member except to ensure that the invariant +/// capacity >= size remains true. +static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer, + int64_t new_capacity_bytes, + char shrink_to_fit); + +/// \brief Ensure a buffer has at least a given additional capacity +/// +/// Ensures that the buffer has space to append at least +/// additional_size_bytes, overallocating when required. 
+static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer, + int64_t additional_size_bytes); + +/// \brief Write data to buffer and increment the buffer size +/// +/// This function does not check that buffer has the required capacity +static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data, + int64_t size_bytes); + +/// \brief Write data to buffer and increment the buffer size +/// +/// This function writes and ensures that the buffer has the required capacity, +/// possibly by reallocating the buffer. Like ArrowBufferReserve, this will +/// overallocate when reallocation is required. +static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer, + const void* data, int64_t size_bytes); + +/// \brief Write fill to buffer and increment the buffer size +/// +/// This function writes the specified number of fill bytes and +/// ensures that the buffer has the required capacity, +static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer, + uint8_t value, int64_t size_bytes); + +/// \brief Write an 8-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer, + int8_t value); + +/// \brief Write an unsigned 8-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer, + uint8_t value); + +/// \brief Write a 16-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer, + int16_t value); + +/// \brief Write an unsigned 16-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt16(struct ArrowBuffer* buffer, + uint16_t value); + +/// \brief Write a 32-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer, + int32_t value); + +/// \brief Write an unsigned 32-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer, + uint32_t value); + +/// 
\brief Write a 64-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* buffer, + int64_t value); + +/// \brief Write an unsigned 64-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer, + uint64_t value); + +/// \brief Write a double to a buffer +static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer, + double value); + +/// \brief Write a float to a buffer +static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer, + float value); + +/// \brief Write an ArrowStringView to a buffer +static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer, + struct ArrowStringView value); + +/// \brief Write an ArrowBufferView to a buffer +static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer, + struct ArrowBufferView value); + +/// @} + +/// \defgroup nanoarrow-bitmap Bitmap utilities +/// +/// @{ + +/// \brief Extract a boolean value from a bitmap +static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i); + +/// \brief Set a boolean value to a bitmap to true +static inline void ArrowBitSet(uint8_t* bits, int64_t i); + +/// \brief Set a boolean value to a bitmap to false +static inline void ArrowBitClear(uint8_t* bits, int64_t i); + +/// \brief Set a boolean value to a bitmap +static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t value); + +/// \brief Set a boolean value to a range in a bitmap +static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, + uint8_t bits_are_set); + +/// \brief Count true values in a bitmap +static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to); + +/// \brief Initialize an ArrowBitmap +/// +/// Initialize the builder's buffer, empty its cache, and reset the size to zero +static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap); + +/// \brief Move an ArrowBitmap +/// 
+/// Transfers the underlying buffer data and lifecycle management to another +/// address and resets the bitmap. +static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst); + +/// \brief Ensure a bitmap builder has at least a given additional capacity +/// +/// Ensures that the buffer has space to append at least +/// additional_size_bits, overallocating when required. +static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, + int64_t additional_size_bits); + +/// \brief Grow or shrink a bitmap to a given capacity +/// +/// When shrinking the capacity of the bitmap, the bitmap is only reallocated +/// if shrink_to_fit is non-zero. Calling ArrowBitmapResize() does not +/// adjust the buffer's size member except when shrinking new_capacity_bits +/// to a value less than the current number of bits in the bitmap. +static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, + int64_t new_capacity_bits, + char shrink_to_fit); + +/// \brief Reserve space for and append zero or more of the same boolean value to a bitmap +static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length); + +/// \brief Append zero or more of the same boolean value to a bitmap +static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length); + +/// \brief Append boolean values encoded as int8_t to a bitmap +/// +/// The values must all be 0 or 1. +static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, + const int8_t* values, int64_t n_values); + +/// \brief Append boolean values encoded as int32_t to a bitmap +/// +/// The values must all be 0 or 1. 
+static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, + const int32_t* values, int64_t n_values); + +/// \brief Reset a bitmap builder +/// +/// Releases any memory held by buffer, empties the cache, and resets the size to zero +static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap); + +/// @} + +/// \defgroup nanoarrow-array Creating arrays +/// +/// These functions allocate, copy, and destroy ArrowArray structures. +/// Once an ArrowArray has been initialized via ArrowArrayInitFromType() +/// or ArrowArrayInitFromSchema(), the caller is responsible for releasing +/// it using the embedded release callback. +/// +/// @{ + +/// \brief Initialize the fields of an array +/// +/// Initializes the fields and release callback of array. Caller +/// is responsible for calling the array->release callback if +/// NANOARROW_OK is returned. +ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, + enum ArrowType storage_type); + +/// \brief Initialize the contents of an ArrowArray from an ArrowSchema +/// +/// Caller is responsible for calling the array->release callback if +/// NANOARROW_OK is returned. +ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, + struct ArrowSchema* schema, + struct ArrowError* error); + +/// \brief Initialize the contents of an ArrowArray from an ArrowArrayView +/// +/// Caller is responsible for calling the array->release callback if +/// NANOARROW_OK is returned. +ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, + struct ArrowArrayView* array_view, + struct ArrowError* error); + +/// \brief Allocate the array->children array +/// +/// Includes the memory for each child struct ArrowArray, +/// whose members are marked as released and may be subsequently initialized +/// with ArrowArrayInitFromType() or moved from an existing ArrowArray. +/// schema must have been allocated using ArrowArrayInitFromType(). 
+ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children); + +/// \brief Allocate the array->dictionary member +/// +/// Includes the memory for the struct ArrowArray, whose contents +/// is marked as released and may be subsequently initialized +/// with ArrowArrayInitFromType() or moved from an existing ArrowArray. +/// array must have been allocated using ArrowArrayInitFromType() +ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array); + +/// \brief Set the validity bitmap of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap); + +/// \brief Set a buffer of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i, + struct ArrowBuffer* buffer); + +/// \brief Get the validity bitmap of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array); + +/// \brief Get a buffer of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i); + +/// \brief Start element-wise appending to an ArrowArray +/// +/// Initializes any values needed to use ArrowArrayAppend*() functions. +/// All element-wise appenders append by value and return EINVAL if the exact value +/// cannot be represented by the underlying storage type. 
+/// array must have been allocated using ArrowArrayInitFromType() +static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array); + +/// \brief Reserve space for future appends +/// +/// For buffer sizes that can be calculated (i.e., not string data buffers or +/// child array sizes for non-fixed-size arrays), recursively reserve space for +/// additional elements. This is useful for reducing the number of reallocations +/// that occur using the item-wise appenders. +ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array, + int64_t additional_size_elements); + +/// \brief Append a null value to an array +static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n); + +/// \brief Append an empty, non-null value to an array +static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n); + +/// \brief Append a signed integer value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise (e.g., value +/// is outside the valid array range). +static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, int64_t value); + +/// \brief Append an unsigned integer value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise (e.g., value +/// is outside the valid array range). +static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, + uint64_t value); + +/// \brief Append a double value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise (e.g., value +/// is outside the valid array range or there is an attempt to append +/// a non-integer to an array with an integer storage type). 
+static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, + double value); + +/// \brief Append a string of bytes to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise (e.g., +/// the underlying array is not a binary, string, large binary, large string, +/// or fixed-size binary array, or value is the wrong size for a fixed-size +/// binary array). +static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, + struct ArrowBufferView value); + +/// \brief Append a string value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise (e.g., +/// the underlying array is not a string or large string array). +static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, + struct ArrowStringView value); + +/// \brief Append a Interval to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise. +static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, + struct ArrowInterval* value); + +/// \brief Append a decimal value to an array +/// +/// Returns NANOARROW_OK if array is a decimal array with the appropriate +/// bitwidth or EINVAL otherwise. +static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, + struct ArrowDecimal* value); + +/// \brief Finish a nested array element +/// +/// Appends a non-null element to the array based on the first child's current +/// length. Returns NANOARROW_OK if the item was successfully added or EINVAL +/// if the underlying storage type is not a struct, list, large list, or fixed-size +/// list, or if there was an attempt to add a struct or fixed-size list element where the +/// length of the child array(s) did not match the expected length. 
+static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array); + +/// \brief Finish a union array element +/// +/// Appends an element to the union type ids buffer and increments array->length. +/// For sparse unions, up to one element is added to non type-id children. Returns +/// EINVAL if the underlying storage type is not a union, if type_id is not valid, +/// or if child sizes after appending are inconsistent. +static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array, + int8_t type_id); + +/// \brief Shrink buffer capacity to the size required +/// +/// Also applies shrinking to any child arrays. array must have been allocated using +/// ArrowArrayInitFromType +static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array); + +/// \brief Finish building an ArrowArray +/// +/// Flushes any pointers from internal buffers that may have been reallocated +/// into array->buffers and checks the actual size of the buffers +/// against the expected size based on the final length. +/// array must have been allocated using ArrowArrayInitFromType() +ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, + struct ArrowError* error); + +/// \brief Finish building an ArrowArray with explicit validation +/// +/// Finish building with an explicit validation level. This could perform less validation +/// (i.e. NANOARROW_VALIDATION_LEVEL_NONE or NANOARROW_VALIDATION_LEVEL_MINIMAL) if CPU +/// buffer data access is not possible or more validation (i.e., +/// NANOARROW_VALIDATION_LEVEL_FULL) if buffer content was obtained from an untrusted or +/// corruptable source. +ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, + enum ArrowValidationLevel validation_level, + struct ArrowError* error); + +/// @} + +/// \defgroup nanoarrow-array-view Reading arrays +/// +/// These functions read and validate the contents ArrowArray structures. 
+///
+/// @{
+
+/// \brief Initialize the contents of an ArrowArrayView
+void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view,
+                                enum ArrowType storage_type);
+
+/// \brief Move an ArrowArrayView
+///
+/// Transfers the ArrowArrayView data and lifecycle management to another
+/// address and resets the contents of src.
+static inline void ArrowArrayViewMove(struct ArrowArrayView* src,
+                                      struct ArrowArrayView* dst);
+
+/// \brief Initialize the contents of an ArrowArrayView from an ArrowSchema
+ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view,
+                                            struct ArrowSchema* schema,
+                                            struct ArrowError* error);
+
+/// \brief Allocate the array_view->children array
+///
+/// Includes the memory for each child struct ArrowArrayView
+ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view,
+                                              int64_t n_children);
+
+/// \brief Allocate array_view->dictionary
+ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view);
+
+/// \brief Set data-independent buffer sizes from length
+void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length);
+
+/// \brief Set buffer sizes and data pointers from an ArrowArray
+ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view,
+                                      struct ArrowArray* array, struct ArrowError* error);
+
+/// \brief Set buffer sizes and data pointers from an ArrowArray except for those
+/// that require dereferencing buffer content.
+ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view,
+                                             struct ArrowArray* array,
+                                             struct ArrowError* error);
+
+/// \brief Performs checks on the content of an ArrowArrayView
+///
+/// If using ArrowArrayViewSetArray() to back array_view with an ArrowArray,
+/// the buffer sizes and some content (first and last offset) have already
+/// been validated at the "default" level. If setting the buffer pointers
+/// and sizes otherwise, you may wish to perform checks at a different level.
See +/// documentation for ArrowValidationLevel for the details of checks performed +/// at each level. +ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, + enum ArrowValidationLevel validation_level, + struct ArrowError* error); + +/// \brief Reset the contents of an ArrowArrayView and frees resources +void ArrowArrayViewReset(struct ArrowArrayView* array_view); + +/// \brief Check for a null element in an ArrowArrayView +static inline int8_t ArrowArrayViewIsNull(struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get the type id of a union array element +static inline int8_t ArrowArrayViewUnionTypeId(struct ArrowArrayView* array_view, + int64_t i); + +/// \brief Get the child index of a union array element +static inline int8_t ArrowArrayViewUnionChildIndex(struct ArrowArrayView* array_view, + int64_t i); + +/// \brief Get the index to use into the relevant union child array +static inline int64_t ArrowArrayViewUnionChildOffset(struct ArrowArrayView* array_view, + int64_t i); + +/// \brief Get an element in an ArrowArrayView as an integer +/// +/// This function does not check for null values, that values are actually integers, or +/// that values are within a valid range for an int64. +static inline int64_t ArrowArrayViewGetIntUnsafe(struct ArrowArrayView* array_view, + int64_t i); + +/// \brief Get an element in an ArrowArrayView as an unsigned integer +/// +/// This function does not check for null values, that values are actually integers, or +/// that values are within a valid range for a uint64. +static inline uint64_t ArrowArrayViewGetUIntUnsafe(struct ArrowArrayView* array_view, + int64_t i); + +/// \brief Get an element in an ArrowArrayView as a double +/// +/// This function does not check for null values, or +/// that values are within a valid range for a double. 
+static inline double ArrowArrayViewGetDoubleUnsafe(struct ArrowArrayView* array_view, + int64_t i); + +/// \brief Get an element in an ArrowArrayView as an ArrowStringView +/// +/// This function does not check for null values. +static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( + struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as an ArrowBufferView +/// +/// This function does not check for null values. +static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( + struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as an ArrowDecimal +/// +/// This function does not check for null values. The out parameter must +/// be initialized with ArrowDecimalInit() with the proper parameters for this +/// type before calling this for the first time. +static inline void ArrowArrayViewGetDecimalUnsafe(struct ArrowArrayView* array_view, + int64_t i, struct ArrowDecimal* out); + +/// @} + +/// \defgroup nanoarrow-basic-array-stream Basic ArrowArrayStream implementation +/// +/// An implementation of an ArrowArrayStream based on a collection of +/// zero or more previously-existing ArrowArray objects. Users should +/// initialize and/or validate the contents before transferring the +/// responsibility of the ArrowArrayStream elsewhere. +/// +/// @{ + +/// \brief Initialize an ArrowArrayStream backed by this implementation +/// +/// This function moves the ownership of schema to the array_stream. If +/// this function returns NANOARROW_OK, the caller is responsible for +/// releasing the ArrowArrayStream. +ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream, + struct ArrowSchema* schema, int64_t n_arrays); + +/// \brief Set the ith ArrowArray in this ArrowArrayStream. +/// +/// array_stream must have been initialized with ArrowBasicArrayStreamInit(). +/// This function move the ownership of array to the array_stream. 
i must
+/// be greater than or equal to zero and less than the value of n_arrays passed in
+/// ArrowBasicArrayStreamInit(). Callers are not required to fill all
+/// n_arrays members (i.e., n_arrays is a maximum bound).
+void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_t i,
+                                   struct ArrowArray* array);
+
+/// \brief Validate the contents of this ArrowArrayStream
+///
+/// array_stream must have been initialized with ArrowBasicArrayStreamInit().
+/// This function uses ArrowArrayStreamInitFromSchema() and ArrowArrayStreamSetArray()
+/// to validate the contents of the arrays.
+ArrowErrorCode ArrowBasicArrayStreamValidate(struct ArrowArrayStream* array_stream,
+                                             struct ArrowError* error);
+
+/// @}
+
+// Inline function definitions
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_BUFFER_INLINE_H_INCLUDED
+#define NANOARROW_BUFFER_INLINE_H_INCLUDED
+
+#include <errno.h>
+#include <stdint.h>
+#include <string.h>
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline int64_t _ArrowGrowByFactor(int64_t current_capacity, int64_t new_capacity) {
+  int64_t doubled_capacity = current_capacity * 2;
+  if (doubled_capacity > new_capacity) {
+    return doubled_capacity;
+  } else {
+    return new_capacity;
+  }
+}
+
+static inline void ArrowBufferInit(struct ArrowBuffer* buffer) {
+  buffer->data = NULL;
+  buffer->size_bytes = 0;
+  buffer->capacity_bytes = 0;
+  buffer->allocator = ArrowBufferAllocatorDefault();
+}
+
+static inline ArrowErrorCode ArrowBufferSetAllocator(
+    struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator) {
+  if (buffer->data == NULL) {
+    buffer->allocator = allocator;
+    return NANOARROW_OK;
+  } else {
+    return EINVAL;
+  }
+}
+
+static inline void ArrowBufferReset(struct ArrowBuffer* buffer) {
+  if (buffer->data != NULL) {
+    buffer->allocator.free(&buffer->allocator, (uint8_t*)buffer->data,
+                           buffer->capacity_bytes);
+    buffer->data = NULL;
+  }
+
+  buffer->capacity_bytes = 0;
+  buffer->size_bytes = 0;
+}
+
+static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst) {
+  memcpy(dst, src, sizeof(struct ArrowBuffer));
+  src->data = NULL;
+  ArrowBufferReset(src);
+}
+
+static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer,
+                                               int64_t new_capacity_bytes,
+                                               char shrink_to_fit) {
+  if (new_capacity_bytes < 0) {
+    return EINVAL;
+  }
+
+  if (new_capacity_bytes > buffer->capacity_bytes || shrink_to_fit) {
+    buffer->data = buffer->allocator.reallocate(
+        &buffer->allocator, buffer->data, buffer->capacity_bytes, new_capacity_bytes);
+    if (buffer->data == NULL && new_capacity_bytes > 0) {
+      buffer->capacity_bytes = 0;
+      buffer->size_bytes = 0;
+      return ENOMEM;
+    }
+
+    buffer->capacity_bytes = new_capacity_bytes;
+  }
+
+  // Ensures that when shrinking that size <= capacity
+  if (new_capacity_bytes < 
buffer->size_bytes) { + buffer->size_bytes = new_capacity_bytes; + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer, + int64_t additional_size_bytes) { + int64_t min_capacity_bytes = buffer->size_bytes + additional_size_bytes; + if (min_capacity_bytes <= buffer->capacity_bytes) { + return NANOARROW_OK; + } + + return ArrowBufferResize( + buffer, _ArrowGrowByFactor(buffer->capacity_bytes, min_capacity_bytes), 0); +} + +static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data, + int64_t size_bytes) { + if (size_bytes > 0) { + memcpy(buffer->data + buffer->size_bytes, data, size_bytes); + buffer->size_bytes += size_bytes; + } +} + +static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer, + const void* data, int64_t size_bytes) { + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); + + ArrowBufferAppendUnsafe(buffer, data, size_bytes); + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer, + int8_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int8_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer, + uint8_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint8_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer, + int16_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int16_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt16(struct ArrowBuffer* buffer, + uint16_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint16_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer, + int32_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int32_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer, + uint32_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint32_t)); +} + 
+static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* buffer, + int64_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int64_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer, + uint64_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint64_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer, + double value) { + return ArrowBufferAppend(buffer, &value, sizeof(double)); +} + +static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer, + float value) { + return ArrowBufferAppend(buffer, &value, sizeof(float)); +} + +static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer, + struct ArrowStringView value) { + return ArrowBufferAppend(buffer, value.data, value.size_bytes); +} + +static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer, + struct ArrowBufferView value) { + return ArrowBufferAppend(buffer, value.data.data, value.size_bytes); +} + +static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer, + uint8_t value, int64_t size_bytes) { + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); + + memset(buffer->data + buffer->size_bytes, value, size_bytes); + buffer->size_bytes += size_bytes; + return NANOARROW_OK; +} + +static const uint8_t _ArrowkBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128}; +static const uint8_t _ArrowkFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, 127}; +static const uint8_t _ArrowkPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127}; +static const uint8_t _ArrowkTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128}; + +static const uint8_t _ArrowkBytePopcount[] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, + 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, + 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 
3, 3, 4, 3, 4, 4, + 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, + 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, + 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, + 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, + 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; + +static inline int64_t _ArrowRoundUpToMultipleOf8(int64_t value) { + return (value + 7) & ~((int64_t)7); +} + +static inline int64_t _ArrowRoundDownToMultipleOf8(int64_t value) { + return (value / 8) * 8; +} + +static inline int64_t _ArrowBytesForBits(int64_t bits) { + return (bits >> 3) + ((bits & 7) != 0); +} + +static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { + *out = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 | values[4] << 4 | + values[5] << 5 | values[6] << 6 | values[7] << 7); +} + +static inline void _ArrowBitmapPackInt32(const int32_t* values, uint8_t* out) { + *out = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 | values[4] << 4 | + values[5] << 5 | values[6] << 6 | values[7] << 7); +} + +static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) { + return (bits[i >> 3] >> (i & 0x07)) & 1; +} + +static inline void ArrowBitSet(uint8_t* bits, int64_t i) { + bits[i / 8] |= _ArrowkBitmask[i % 8]; +} + +static inline void ArrowBitClear(uint8_t* bits, int64_t i) { + bits[i / 8] &= _ArrowkFlippedBitmask[i % 8]; +} + +static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t bit_is_set) { + bits[i / 8] ^= + ((uint8_t)(-((uint8_t)(bit_is_set != 0)) ^ bits[i / 8])) & _ArrowkBitmask[i % 8]; +} + +static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, + uint8_t bits_are_set) { + const int64_t i_begin = start_offset; + const 
int64_t i_end = start_offset + length; + const uint8_t fill_byte = (uint8_t)(-bits_are_set); + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_end = i_end / 8 + 1; + + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; + const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_end % 8]; + + if (bytes_end == bytes_begin + 1) { + // set bits within a single byte + const uint8_t only_byte_mask = + i_end % 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); + bits[bytes_begin] &= only_byte_mask; + bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); + return; + } + + // set/clear trailing bits of first byte + bits[bytes_begin] &= first_byte_mask; + bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); + + if (bytes_end - bytes_begin > 2) { + // set/clear whole bytes + memset(bits + bytes_begin + 1, fill_byte, (size_t)(bytes_end - bytes_begin - 2)); + } + + if (i_end % 8 == 0) { + return; + } + + // set/clear leading bits of last byte + bits[bytes_end - 1] &= last_byte_mask; + bits[bytes_end - 1] |= (uint8_t)(fill_byte & ~last_byte_mask); +} + +static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t start_offset, + int64_t length) { + if (length == 0) { + return 0; + } + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + // count bits within a single byte + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_end % 8]; + const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_begin % 8]; + + const uint8_t only_byte_mask = + i_end % 8 == 0 ? 
last_byte_mask : (uint8_t)(first_byte_mask & last_byte_mask); + + const uint8_t byte_masked = bits[bytes_begin] & only_byte_mask; + return _ArrowkBytePopcount[byte_masked]; + } + + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; + const uint8_t last_byte_mask = i_end % 8 == 0 ? 0 : _ArrowkTrailingBitmask[i_end % 8]; + int64_t count = 0; + + // first byte + count += _ArrowkBytePopcount[bits[bytes_begin] & ~first_byte_mask]; + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + count += _ArrowkBytePopcount[bits[i]]; + } + + // last byte + count += _ArrowkBytePopcount[bits[bytes_last_valid] & ~last_byte_mask]; + + return count; +} + +static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap) { + ArrowBufferInit(&bitmap->buffer); + bitmap->size_bits = 0; +} + +static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst) { + ArrowBufferMove(&src->buffer, &dst->buffer); + dst->size_bits = src->size_bits; + src->size_bits = 0; +} + +static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, + int64_t additional_size_bits) { + int64_t min_capacity_bits = bitmap->size_bits + additional_size_bits; + if (min_capacity_bits <= (bitmap->buffer.capacity_bytes * 8)) { + return NANOARROW_OK; + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferReserve(&bitmap->buffer, _ArrowBytesForBits(additional_size_bits))); + + bitmap->buffer.data[bitmap->buffer.capacity_bytes - 1] = 0; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, + int64_t new_capacity_bits, + char shrink_to_fit) { + if (new_capacity_bits < 0) { + return EINVAL; + } + + int64_t new_capacity_bytes = _ArrowBytesForBits(new_capacity_bits); + NANOARROW_RETURN_NOT_OK( + ArrowBufferResize(&bitmap->buffer, new_capacity_bytes, shrink_to_fit)); + + if (new_capacity_bits < bitmap->size_bits) { + bitmap->size_bits = new_capacity_bits; + } + + return NANOARROW_OK; +} + +static inline 
ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(bitmap, length)); + + ArrowBitmapAppendUnsafe(bitmap, bits_are_set, length); + return NANOARROW_OK; +} + +static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length) { + ArrowBitsSetTo(bitmap->buffer.data, bitmap->size_bits, length, bits_are_set); + bitmap->size_bits += length; + bitmap->buffer.size_bytes = _ArrowBytesForBits(bitmap->size_bits); +} + +static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, + const int8_t* values, int64_t n_values) { + if (n_values == 0) { + return; + } + + const int8_t* values_cursor = values; + int64_t n_remaining = n_values; + int64_t out_i_cursor = bitmap->size_bits; + uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; + + // First byte + if ((out_i_cursor % 8) != 0) { + int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; + for (int i = 0; i < n_partial_bits; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values[i]); + } + + out_cursor++; + values_cursor += n_partial_bits; + n_remaining -= n_partial_bits; + } + + // Middle bytes + int64_t n_full_bytes = n_remaining / 8; + for (int64_t i = 0; i < n_full_bytes; i++) { + _ArrowBitmapPackInt8(values_cursor, out_cursor); + values_cursor += 8; + out_cursor++; + } + + // Last byte + out_i_cursor += n_full_bytes * 8; + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte + *out_cursor = 0x00; + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + } + out_cursor++; + } + + bitmap->size_bits += n_values; + bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; +} + +static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, + const int32_t* values, int64_t n_values) { + if (n_values == 0) { + return; 
+ } + + const int32_t* values_cursor = values; + int64_t n_remaining = n_values; + int64_t out_i_cursor = bitmap->size_bits; + uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; + + // First byte + if ((out_i_cursor % 8) != 0) { + int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; + for (int i = 0; i < n_partial_bits; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values[i]); + } + + out_cursor++; + values_cursor += n_partial_bits; + n_remaining -= n_partial_bits; + } + + // Middle bytes + int64_t n_full_bytes = n_remaining / 8; + for (int64_t i = 0; i < n_full_bytes; i++) { + _ArrowBitmapPackInt32(values_cursor, out_cursor); + values_cursor += 8; + out_cursor++; + } + + // Last byte + out_i_cursor += n_full_bytes * 8; + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte + *out_cursor = 0x00; + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + } + out_cursor++; + } + + bitmap->size_bits += n_values; + bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; +} + +static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap) { + ArrowBufferReset(&bitmap->buffer); + bitmap->size_bits = 0; +} + +#ifdef __cplusplus +} +#endif + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_ARRAY_INLINE_H_INCLUDED
+#define NANOARROW_ARRAY_INLINE_H_INCLUDED
+
+#include <errno.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array) {
+  struct ArrowArrayPrivateData* private_data =
+      (struct ArrowArrayPrivateData*)array->private_data;
+  return &private_data->bitmap;
+}
+
+static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i) {
+  struct ArrowArrayPrivateData* private_data =
+      (struct ArrowArrayPrivateData*)array->private_data;
+  switch (i) {
+    case 0:
+      return &private_data->bitmap.buffer;
+    default:
+      return private_data->buffers + i - 1;
+  }
+}
+
+// We don't currently support the case of unions where type_id != child_index;
+// however, these functions are used to keep track of where that assumption
+// is made.
+static inline int8_t _ArrowArrayUnionChildIndex(struct ArrowArray* array, + int8_t type_id) { + return type_id; +} + +static inline int8_t _ArrowArrayUnionTypeId(struct ArrowArray* array, + int8_t child_index) { + return child_index; +} + +static inline int8_t _ArrowParseUnionTypeIds(const char* type_ids, int8_t* out) { + if (*type_ids == '\0') { + return 0; + } + + int32_t i = 0; + long type_id; + char* end_ptr; + do { + type_id = strtol(type_ids, &end_ptr, 10); + if (end_ptr == type_ids || type_id < 0 || type_id > 127) { + return -1; + } + + if (out != NULL) { + out[i] = (int8_t)type_id; + } + + i++; + + type_ids = end_ptr; + if (*type_ids == '\0') { + return i; + } else if (*type_ids != ',') { + return -1; + } else { + type_ids++; + } + } while (1); + + return -1; +} + +static inline int8_t _ArrowParsedUnionTypeIdsWillEqualChildIndices(const int8_t* type_ids, + int64_t n_type_ids, + int64_t n_children) { + if (n_type_ids != n_children) { + return 0; + } + + for (int8_t i = 0; i < n_type_ids; i++) { + if (type_ids[i] != i) { + return 0; + } + } + + return 1; +} + +static inline int8_t _ArrowUnionTypeIdsWillEqualChildIndices(const char* type_id_str, + int64_t n_children) { + int8_t type_ids[128]; + int8_t n_type_ids = _ArrowParseUnionTypeIds(type_id_str, type_ids); + return _ArrowParsedUnionTypeIdsWillEqualChildIndices(type_ids, n_type_ids, n_children); +} + +static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_UNINITIALIZED: + return EINVAL; + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_DENSE_UNION: + // Note that this value could be -1 if the type_ids string was invalid + if (private_data->union_type_id_is_child_index != 1) { + return EINVAL; + } else { + break; + } + default: + break; + } + if (private_data->storage_type == NANOARROW_TYPE_UNINITIALIZED) { + 
return EINVAL; + } + + // Initialize any data offset buffer with a single zero + for (int i = 0; i < 3; i++) { + if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && + private_data->layout.element_size_bits[i] == 64) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(ArrowArrayBuffer(array, i), 0)); + } else if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && + private_data->layout.element_size_bits[i] == 32) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(ArrowArrayBuffer(array, i), 0)); + } + } + + // Start building any child arrays or dictionaries + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->children[i])); + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->dictionary)); + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) { + for (int64_t i = 0; i < 3; i++) { + struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i); + NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, buffer->size_bytes, 1)); + } + + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->children[i])); + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->dictionary)); + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode _ArrowArrayAppendBits(struct ArrowArray* array, + int64_t buffer_i, uint8_t value, + int64_t n) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + struct ArrowBuffer* buffer = ArrowArrayBuffer(array, buffer_i); + int64_t bytes_required = + _ArrowRoundUpToMultipleOf8(private_data->layout.element_size_bits[buffer_i] * + (array->length + 1)) / + 8; + if (bytes_required > buffer->size_bytes) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(buffer, 0, bytes_required - buffer->size_bytes)); + } + + 
ArrowBitsSetTo(buffer->data, array->length, n, value); + return NANOARROW_OK; +} + +static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray* array, + int64_t n, uint8_t is_valid) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + if (n == 0) { + return NANOARROW_OK; + } + + // Some type-specific handling + switch (private_data->storage_type) { + case NANOARROW_TYPE_NA: + // (An empty value for a null array *is* a null) + array->null_count += n; + array->length += n; + return NANOARROW_OK; + + case NANOARROW_TYPE_DENSE_UNION: { + // Add one null to the first child and append n references to that child + int8_t type_id = _ArrowArrayUnionTypeId(array, 0); + NANOARROW_RETURN_NOT_OK( + _ArrowArrayAppendEmptyInternal(array->children[0], 1, is_valid)); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); + for (int64_t i = 0; i < n; i++) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( + ArrowArrayBuffer(array, 1), (int32_t)array->children[0]->length - 1)); + } + // For the purposes of array->null_count, union elements are never considered "null" + // even if some children contain nulls. + array->length += n; + return NANOARROW_OK; + } + + case NANOARROW_TYPE_SPARSE_UNION: { + // Add n nulls to the first child and append n references to that child + int8_t type_id = _ArrowArrayUnionTypeId(array, 0); + NANOARROW_RETURN_NOT_OK( + _ArrowArrayAppendEmptyInternal(array->children[0], n, is_valid)); + for (int64_t i = 1; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); + // For the purposes of array->null_count, union elements are never considered "null" + // even if some children contain nulls. 
+ array->length += n; + return NANOARROW_OK; + } + + case NANOARROW_TYPE_FIXED_SIZE_LIST: + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty( + array->children[0], n * private_data->layout.child_size_elements)); + break; + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); + } + break; + + default: + break; + } + + // Append n is_valid bits to the validity bitmap. If we haven't allocated a bitmap yet + // and we need to append nulls, do it now. + if (!is_valid && private_data->bitmap.buffer.data == NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, array->length + n)); + ArrowBitmapAppendUnsafe(&private_data->bitmap, 1, array->length); + ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); + } else if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, n)); + ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); + } + + // Add appropriate buffer fill + struct ArrowBuffer* buffer; + int64_t size_bytes; + + for (int i = 0; i < 3; i++) { + buffer = ArrowArrayBuffer(array, i); + size_bytes = private_data->layout.element_size_bits[i] / 8; + + switch (private_data->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_NONE: + case NANOARROW_BUFFER_TYPE_VALIDITY: + continue; + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + // Append the current value at the end of the offset buffer for each element + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes * n)); + + for (int64_t j = 0; j < n; j++) { + ArrowBufferAppendUnsafe(buffer, buffer->data + size_bytes * (array->length + j), + size_bytes); + } + + // Skip the data buffer + i++; + continue; + case NANOARROW_BUFFER_TYPE_DATA: + // Zero out the next bit of memory + if (private_data->layout.element_size_bits[i] % 8 == 0) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFill(buffer, 0, size_bytes * n)); + } else { + 
NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, i, 0, n)); + } + continue; + + case NANOARROW_BUFFER_TYPE_TYPE_ID: + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: + // These cases return above + return EINVAL; + } + } + + array->length += n; + array->null_count += n * !is_valid; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n) { + return _ArrowArrayAppendEmptyInternal(array, n, 0); +} + +static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n) { + return _ArrowArrayAppendEmptyInternal(array, n, 1); +} + +static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, + int64_t value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_INT64: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(int64_t))); + break; + case NANOARROW_TYPE_INT32: + _NANOARROW_CHECK_RANGE(value, INT32_MIN, INT32_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, (int32_t)value)); + break; + case NANOARROW_TYPE_INT16: + _NANOARROW_CHECK_RANGE(value, INT16_MIN, INT16_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt16(data_buffer, (int16_t)value)); + break; + case NANOARROW_TYPE_INT8: + _NANOARROW_CHECK_RANGE(value, INT8_MIN, INT8_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt8(data_buffer, (int8_t)value)); + break; + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_UINT8: + _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX); + return ArrowArrayAppendUInt(array, value); + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + case 
NANOARROW_TYPE_BOOL: + NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, + uint64_t value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_UINT64: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(uint64_t))); + break; + case NANOARROW_TYPE_UINT32: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT32_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt32(data_buffer, (uint32_t)value)); + break; + case NANOARROW_TYPE_UINT16: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT16_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt16(data_buffer, (uint16_t)value)); + break; + case NANOARROW_TYPE_UINT8: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT8_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(data_buffer, (uint8_t)value)); + break; + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_INT8: + _NANOARROW_CHECK_UPPER_LIMIT(value, INT64_MAX); + return ArrowArrayAppendInt(array, value); + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + case NANOARROW_TYPE_BOOL: + NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + 
+ array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, + double value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(double))); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, + struct ArrowBufferView value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1); + struct ArrowBuffer* data_buffer = ArrowArrayBuffer( + array, 1 + (private_data->storage_type != NANOARROW_TYPE_FIXED_SIZE_BINARY)); + int32_t offset; + int64_t large_offset; + int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + offset = ((int32_t*)offset_buffer->data)[array->length]; + if ((offset + value.size_bytes) > INT32_MAX) { + return EINVAL; + } + + offset += (int32_t)value.size_bytes; + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(offset_buffer, &offset, sizeof(int32_t))); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); + break; + + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + large_offset = ((int64_t*)offset_buffer->data)[array->length]; + large_offset += value.size_bytes; + 
NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(offset_buffer, &large_offset, sizeof(int64_t))); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); + break; + + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + if (value.size_bytes != fixed_size_bytes) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, + struct ArrowStringView value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBufferView buffer_view; + buffer_view.data.data = value.data; + buffer_view.size_bytes = value.size_bytes; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + return ArrowArrayAppendBytes(array, buffer_view); + default: + return EINVAL; + } +} + +static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, + struct ArrowInterval* value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTHS) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + if (value->type != NANOARROW_TYPE_INTERVAL_DAY_TIME) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); + 
NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->ms)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(data_buffer, value->ns)); + break; + } + default: + return EINVAL; + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, + struct ArrowDecimal* value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DECIMAL128: + if (value->n_words != 2) { + return EINVAL; + } else { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value->words, 2 * sizeof(uint64_t))); + break; + } + case NANOARROW_TYPE_DECIMAL256: + if (value->n_words != 4) { + return EINVAL; + } else { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value->words, 4 * sizeof(uint64_t))); + break; + } + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + int64_t child_length; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: + child_length = array->children[0]->length; + if (child_length > INT32_MAX) { + return EINVAL; + } + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(ArrowArrayBuffer(array, 1), 
(int32_t)child_length)); + break; + case NANOARROW_TYPE_LARGE_LIST: + child_length = array->children[0]->length; + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt64(ArrowArrayBuffer(array, 1), child_length)); + break; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + child_length = array->children[0]->length; + if (child_length != + ((array->length + 1) * private_data->layout.child_size_elements)) { + return EINVAL; + } + break; + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < array->n_children; i++) { + child_length = array->children[i]->length; + if (child_length != (array->length + 1)) { + return EINVAL; + } + } + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array, + int8_t type_id) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + int64_t child_index = _ArrowArrayUnionChildIndex(array, type_id); + if (child_index < 0 || child_index >= array->n_children) { + return EINVAL; + } + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + // Apppend the target child length to the union offsets buffer + _NANOARROW_CHECK_RANGE(array->children[child_index]->length, 0, INT32_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( + ArrowArrayBuffer(array, 1), (int32_t)array->children[child_index]->length - 1)); + break; + case NANOARROW_TYPE_SPARSE_UNION: + // Append one empty to any non-target column that isn't already the right length + // or abort if appending a null will result in a column with invalid length + for (int64_t i = 0; i < array->n_children; i++) { + if (i == child_index || array->children[i]->length == (array->length + 1)) { + continue; + } + + if (array->children[i]->length != array->length) { + return EINVAL; + } + + 
NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], 1)); + } + + break; + default: + return EINVAL; + } + + // Write to the type_ids buffer + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt8(ArrowArrayBuffer(array, 0), (int8_t)type_id)); + array->length++; + return NANOARROW_OK; +} + +static inline void ArrowArrayViewMove(struct ArrowArrayView* src, + struct ArrowArrayView* dst) { + memcpy(dst, src, sizeof(struct ArrowArrayView)); + ArrowArrayViewInitFromType(src, NANOARROW_TYPE_UNINITIALIZED); +} + +static inline int8_t ArrowArrayViewIsNull(struct ArrowArrayView* array_view, int64_t i) { + const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8; + i += array_view->offset; + switch (array_view->storage_type) { + case NANOARROW_TYPE_NA: + return 0x01; + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + // Unions are "never null" in Arrow land + return 0x00; + default: + return validity_buffer != NULL && !ArrowBitGet(validity_buffer, i); + } +} + +static inline int8_t ArrowArrayViewUnionTypeId(struct ArrowArrayView* array_view, + int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + return array_view->buffer_views[0].data.as_int8[i]; + default: + return -1; + } +} + +static inline int8_t ArrowArrayViewUnionChildIndex(struct ArrowArrayView* array_view, + int64_t i) { + int8_t type_id = ArrowArrayViewUnionTypeId(array_view, i); + if (array_view->union_type_id_map == NULL) { + return type_id; + } else { + return array_view->union_type_id_map[type_id]; + } +} + +static inline int64_t ArrowArrayViewUnionChildOffset(struct ArrowArrayView* array_view, + int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + return array_view->buffer_views[1].data.as_int32[i]; + case NANOARROW_TYPE_SPARSE_UNION: + return i; + default: + return -1; + } +} + +static inline int64_t ArrowArrayViewListChildOffset(struct ArrowArrayView* 
array_view, + int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_LIST: + return array_view->buffer_views[1].data.as_int32[i]; + case NANOARROW_TYPE_LARGE_LIST: + return array_view->buffer_views[1].data.as_int64[i]; + default: + return -1; + } +} + +static inline int64_t ArrowArrayViewGetIntUnsafe(struct ArrowArrayView* array_view, + int64_t i) { + struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + i += array_view->offset; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return (int64_t)data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return (int64_t)data_view->data.as_float[i]; + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return INT64_MAX; + } +} + +static inline uint64_t ArrowArrayViewGetUIntUnsafe(struct ArrowArrayView* array_view, + int64_t i) { + i += array_view->offset; + struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return 
data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return (uint64_t)data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return (uint64_t)data_view->data.as_float[i]; + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return UINT64_MAX; + } +} + +static inline double ArrowArrayViewGetDoubleUnsafe(struct ArrowArrayView* array_view, + int64_t i) { + i += array_view->offset; + struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return (double)data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return (double)data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return data_view->data.as_float[i]; + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return DBL_MAX; + } +} + +static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( + struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; + const char* data_view = array_view->buffer_views[2].data.as_char; + + struct ArrowStringView view; + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + view.data = data_view + offsets_view->data.as_int32[i]; + view.size_bytes = + offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; + break; + case 
NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + view.data = data_view + offsets_view->data.as_int64[i]; + view.size_bytes = + offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; + break; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + view.size_bytes = array_view->layout.element_size_bits[1] / 8; + view.data = array_view->buffer_views[1].data.as_char + (i * view.size_bytes); + break; + default: + view.data = NULL; + view.size_bytes = 0; + break; + } + + return view; +} + +static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( + struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; + const uint8_t* data_view = array_view->buffer_views[2].data.as_uint8; + + struct ArrowBufferView view; + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + view.size_bytes = + offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; + view.data.as_uint8 = data_view + offsets_view->data.as_int32[i]; + break; + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + view.size_bytes = + offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; + view.data.as_uint8 = data_view + offsets_view->data.as_int64[i]; + break; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + view.size_bytes = array_view->layout.element_size_bits[1] / 8; + view.data.as_uint8 = + array_view->buffer_views[1].data.as_uint8 + (i * view.size_bytes); + break; + default: + view.data.data = NULL; + view.size_bytes = 0; + break; + } + + return view; +} + +static inline void ArrowArrayViewGetIntervalUnsafe(struct ArrowArrayView* array_view, + int64_t i, struct ArrowInterval* out) { + const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + const size_t size = sizeof(int32_t); + memcpy(&out->months, data_view + i * size, 
sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + const size_t size = sizeof(int32_t) + sizeof(int32_t); + memcpy(&out->days, data_view + i * size, sizeof(int32_t)); + memcpy(&out->ms, data_view + i * size + 4, sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + const size_t size = sizeof(int32_t) + sizeof(int32_t) + sizeof(int64_t); + memcpy(&out->months, data_view + i * size, sizeof(int32_t)); + memcpy(&out->days, data_view + i * size + 4, sizeof(int32_t)); + memcpy(&out->ns, data_view + i * size + 8, sizeof(int64_t)); + break; + } + default: + break; + } +} + +static inline void ArrowArrayViewGetDecimalUnsafe(struct ArrowArrayView* array_view, + int64_t i, struct ArrowDecimal* out) { + i += array_view->offset; + const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; + switch (array_view->storage_type) { + case NANOARROW_TYPE_DECIMAL128: + ArrowDecimalSetBytes(out, data_view + (i * 16)); + break; + case NANOARROW_TYPE_DECIMAL256: + ArrowDecimalSetBytes(out, data_view + (i * 32)); + break; + default: + memset(out->words, 0, sizeof(out->words)); + break; + } +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index f302c649bc7bd..fabcd220b8b30 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -62,7 +62,7 @@ libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} # numpy include dir is implicitly included 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper, _khash_primitive_helper]}, - 'arrays': {'sources': ['arrays.pyx']}, + 'arrays': {'sources': ['arrays.pyx', 'src/vendored/nanoarrow.c']}, 'groupby': {'sources': ['groupby.pyx']}, 'hashing': {'sources': ['hashing.pyx']}, 'hashtable': {'sources': ['hashtable.pyx', _khash_primitive_helper, _hashtable_class_helper, _hashtable_func_helper]}, diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py 
index bec875f2bbfa1..8dc9e55e8a1de 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -15,6 +15,7 @@ lib, missing as libmissing, ) +from pandas._libs.arrays import BitMaskArray from pandas._libs.tslibs import ( get_unit_from_dtype, is_supported_unit, @@ -112,7 +113,7 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): _internal_fill_value: Scalar # our underlying data and mask are each ndarrays _data: np.ndarray - _mask: npt.NDArray[np.bool_] + _mask: BitMaskArray # Fill values used for any/all _truthy_value = Scalar # bool(_truthy_value) = True @@ -122,7 +123,7 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self: result = BaseMaskedArray.__new__(cls) result._data = values - result._mask = mask + result._mask = BitMaskArray(mask) return result def __init__( @@ -142,7 +143,7 @@ def __init__( mask = mask.copy() self._data = values - self._mask = mask + self._mask = BitMaskArray(mask) @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: @@ -181,6 +182,8 @@ def __getitem__(self, item: SequenceIndexer) -> Self: def __getitem__(self, item: PositionalIndexer) -> Self | Any: item = check_array_indexer(self, item) + # TODO: need to change this to special case multiple + # indexers versus just scalar newmask = self._mask[item] if is_bool(newmask): # This is a scalar indexing @@ -204,7 +207,7 @@ def pad_or_backfill( func = missing.get_fill_func(method, ndim=self.ndim) npvalues = self._data.T - new_mask = mask.T + new_mask = mask.to_numpy().T if copy: npvalues = npvalues.copy() new_mask = new_mask.copy() @@ -226,7 +229,7 @@ def fillna( ) -> Self: value, method = validate_fillna_kwargs(value, method) - mask = self._mask + mask = self._mask.to_numpy() value = missing.check_value_size(value, mask, len(self)) @@ -234,7 +237,7 @@ def fillna( if method is not None: func = missing.get_fill_func(method, ndim=self.ndim) npvalues = self._data.T 
- new_mask = mask.T + new_mask = mask.to_numpy().T if copy: npvalues = npvalues.copy() new_mask = new_mask.copy() @@ -308,7 +311,8 @@ def __contains__(self, key) -> bool: if isna(key) and key is not self.dtype.na_value: # GH#52840 if self._data.dtype.kind == "f" and lib.is_float(key): - return bool((np.isnan(self._data) & ~self._mask).any()) + # TODO: implement low level invert operator on BitMaskArray + return bool((np.isnan(self._data) & ~self._mask.to_numpy()).any()) return bool(super().__contains__(key)) @@ -319,7 +323,7 @@ def __iter__(self) -> Iterator: yield val else: na_value = self.dtype.na_value - for isna_, val in zip(self._mask, self._data): + for isna_, val in zip(self._mask.to_numpy(), self._data): if isna_: yield na_value else: @@ -341,28 +345,28 @@ def ndim(self) -> int: def swapaxes(self, axis1, axis2) -> Self: data = self._data.swapaxes(axis1, axis2) - mask = self._mask.swapaxes(axis1, axis2) + mask = self._mask.to_numpy().swapaxes(axis1, axis2) return self._simple_new(data, mask) def delete(self, loc, axis: AxisInt = 0) -> Self: data = np.delete(self._data, loc, axis=axis) - mask = np.delete(self._mask, loc, axis=axis) + mask = np.delete(self._mask.to_numpy(), loc, axis=axis) return self._simple_new(data, mask) def reshape(self, *args, **kwargs) -> Self: data = self._data.reshape(*args, **kwargs) - mask = self._mask.reshape(*args, **kwargs) + mask = self._mask.to_numpy().reshape(*args, **kwargs) return self._simple_new(data, mask) def ravel(self, *args, **kwargs) -> Self: # TODO: need to make sure we have the same order for data/mask data = self._data.ravel(*args, **kwargs) - mask = self._mask.ravel(*args, **kwargs) + mask = self._mask.to_numpy().ravel(*args, **kwargs) return type(self)(data, mask) @property def T(self) -> Self: - return self._simple_new(self._data.T, self._mask.T) + return self._simple_new(self._data.T, self._mask.to_numpy().T) def round(self, decimals: int = 0, *args, **kwargs): """ @@ -392,22 +396,22 @@ def round(self, 
decimals: int = 0, *args, **kwargs): values = np.round(self._data, decimals=decimals, **kwargs) # Usually we'll get same type as self, but ndarray[bool] casts to float - return self._maybe_mask_result(values, self._mask.copy()) + return self._maybe_mask_result(values, self._mask.to_numpy().copy()) # ------------------------------------------------------------------ # Unary Methods def __invert__(self) -> Self: - return self._simple_new(~self._data, self._mask.copy()) + return self._simple_new(~self._data, self._mask.to_numpy().copy()) def __neg__(self) -> Self: - return self._simple_new(-self._data, self._mask.copy()) + return self._simple_new(-self._data, self._mask.to_numpy().copy()) def __pos__(self) -> Self: return self.copy() def __abs__(self) -> Self: - return self._simple_new(abs(self._data), self._mask.copy()) + return self._simple_new(abs(self._data), self._mask.to_numpy().copy()) # ------------------------------------------------------------------ @@ -498,7 +502,7 @@ def to_numpy( with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=RuntimeWarning) data = self._data.astype(dtype) - data[self._mask] = na_value + data[self._mask.to_numpy()] = na_value else: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=RuntimeWarning) @@ -541,7 +545,11 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: data = self._data.astype(dtype.numpy_dtype, copy=copy) # mask is copied depending on whether the data was copied, and # not directly depending on the `copy` keyword - mask = self._mask if data is self._data else self._mask.copy() + mask = ( + self._mask.to_numpy() + if data is self._data + else self._mask.to_numpy().copy() + ) cls = dtype.construct_array_type() return cls(data, mask, copy=False) @@ -652,7 +660,7 @@ def reconstruct(x: np.ndarray): return tuple(reconstruct(x) for x in result) elif method == "reduce": # e.g. 
np.add.reduce; test_ufunc_reduce_raises - if self._mask.any(): + if self._mask.to_numpy().any(): return self._na_value return result else: @@ -664,7 +672,7 @@ def __arrow_array__(self, type=None): """ import pyarrow as pa - return pa.array(self._data, mask=self._mask, type=type) + return pa.array(self._data, mask=self._mask.to_numpy(), type=type) @property def _hasna(self) -> bool: @@ -673,20 +681,22 @@ def _hasna(self) -> bool: # source code using it.. # error: Incompatible return value type (got "bool_", expected "bool") - return self._mask.any() # type: ignore[return-value] + return self._mask.to_numpy().any() # type: ignore[return-value] def _propagate_mask( self, mask: npt.NDArray[np.bool_] | None, other ) -> npt.NDArray[np.bool_]: if mask is None: - mask = self._mask.copy() # TODO: need test for BooleanArray needing a copy + mask = ( + self._mask.to_numpy().copy() + ) # TODO: need test for BooleanArray needing a copy if other is libmissing.NA: # GH#45421 don't alter inplace mask = mask | True elif is_list_like(other) and len(other) == len(mask): mask = mask | isna(other) else: - mask = self._mask | mask + mask = self._mask.to_numpy() | mask # Incompatible return value type (got "Optional[ndarray[Any, dtype[bool_]]]", # expected "ndarray[Any, dtype[bool_]]") return mask # type: ignore[return-value] @@ -766,7 +776,7 @@ def _arith_method(self, other, op): if op_name == "pow": # 1 ** x is 1. - mask = np.where((self._data == 1) & ~self._mask, False, mask) + mask = np.where((self._data == 1) & ~self._mask.to_numpy(), False, mask) # x ** 0 is 1. if omask is not None: mask = np.where((other == 0) & ~omask, False, mask) @@ -780,7 +790,7 @@ def _arith_method(self, other, op): elif other is not libmissing.NA: mask = np.where(other == 1, False, mask) # x ** 0 is 1. 
- mask = np.where((self._data == 0) & ~self._mask, False, mask) + mask = np.where((self._data == 0) & ~self._mask.to_numpy(), False, mask) return self._maybe_mask_result(result, mask) @@ -876,7 +886,7 @@ def _maybe_mask_result( return result def isna(self) -> np.ndarray: - return self._mask.copy() + return self._mask.to_numpy().copy() @property def _na_value(self): @@ -916,7 +926,11 @@ def take( ) mask = take( - self._mask, indexer, fill_value=True, allow_fill=allow_fill, axis=axis + self._mask.to_numpy(), + indexer, + fill_value=True, + allow_fill=allow_fill, + axis=axis, ) # if we are filling @@ -947,14 +961,14 @@ def isin(self, values) -> BooleanArray: # type: ignore[override] # For now, NA does not propagate so set result according to presence of NA, # see https://github.com/pandas-dev/pandas/pull/38379 for some discussion - result[self._mask] = values_have_NA + result[self._mask.to_numpy()] = values_have_NA mask = np.zeros(self._data.shape, dtype=bool) return BooleanArray(result, mask, copy=False) def copy(self) -> Self: data = self._data.copy() - mask = self._mask.copy() + mask = self._mask.to_numpy().copy() return self._simple_new(data, mask) def unique(self) -> Self: @@ -965,7 +979,7 @@ def unique(self) -> Self: ------- uniques : BaseMaskedArray """ - uniques, mask = algos.unique_with_mask(self._data, self._mask) + uniques, mask = algos.unique_with_mask(self._data, self._mask.to_numpy()) return self._simple_new(uniques, mask) @doc(ExtensionArray.searchsorted) @@ -991,7 +1005,7 @@ def factorize( use_na_sentinel: bool = True, ) -> tuple[np.ndarray, ExtensionArray]: arr = self._data - mask = self._mask + mask = self._mask.to_numpy() # Use a sentinel for na; recode and add NA to uniques if necessary below codes, uniques = factorize_array(arr, use_na_sentinel=True, mask=mask) @@ -1050,7 +1064,7 @@ def value_counts(self, dropna: bool = True) -> Series: from pandas.arrays import IntegerArray keys, value_counts = algos.value_counts_arraylike( - self._data, 
dropna=True, mask=self._mask + self._data, dropna=True, mask=self._mask.to_numpy() ) if dropna: @@ -1062,7 +1076,7 @@ def value_counts(self, dropna: bool = True) -> Series: # if we want nans, count the mask counts = np.empty(len(value_counts) + 1, dtype="int64") counts[:-1] = value_counts - counts[-1] = self._mask.sum() + counts[-1] = self._mask.to_numpy().sum() index = Index(keys, dtype=self.dtype).insert(len(keys), self.dtype.na_value) index = index.astype(self.dtype) @@ -1081,11 +1095,11 @@ def equals(self, other) -> bool: # GH#44382 if e.g. self[1] is np.nan and other[1] is pd.NA, we are NOT # equal. - if not np.array_equal(self._mask, other._mask): + if not np.array_equal(self._mask.to_numpy(), other._mask): return False - left = self._data[~self._mask] - right = other._data[~other._mask] + left = self._data[~self._mask.to_numpy()] + right = other._data[~other._mask.to_numpy()] return array_equivalent(left, right, strict_nan=True, dtype_equal=True) def _quantile( @@ -1101,7 +1115,7 @@ def _quantile( """ res = quantile_with_mask( self._data, - mask=self._mask, + mask=self._mask.to_numpy(), # TODO(GH#40932): na_value_for_dtype(self.dtype.numpy_dtype) # instead of np.nan fill_value=np.nan, @@ -1140,7 +1154,7 @@ def _reduce( else: # median, skew, kurt, sem data = self._data - mask = self._mask + mask = self._mask.to_numpy() op = getattr(nanops, f"nan{name}") axis = kwargs.pop("axis", None) result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs) @@ -1162,9 +1176,9 @@ def _wrap_reduction_result(self, name: str, result, *, skipna, axis): if isinstance(result, np.ndarray): if skipna: # we only retain mask for all-NA rows/columns - mask = self._mask.all(axis=axis) + mask = self._mask.to_numpy().all(axis=axis) else: - mask = self._mask.any(axis=axis) + mask = self._mask.to_numpy().any(axis=axis) return self._maybe_mask_result(result, mask) return result @@ -1369,7 +1383,7 @@ def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): if skipna: 
return result else: - if result or len(self) == 0 or not self._mask.any(): + if result or len(self) == 0 or not self._mask.to_numpy().any(): return result else: return self.dtype.na_value @@ -1451,7 +1465,7 @@ def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): if skipna: return result else: - if not result or len(self) == 0 or not self._mask.any(): + if not result or len(self) == 0 or not self._mask.to_numpy().any(): return result else: return self.dtype.na_value From b69c00fc0956aeea94ff6e032a9ed3d578935af5 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 11 Aug 2023 05:30:35 -0400 Subject: [PATCH 002/126] removed cpplint --- .pre-commit-config.yaml | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 90627216a1354..000949c41f5a0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -70,19 +70,6 @@ repos: - id: fix-encoding-pragma args: [--remove] - id: trailing-whitespace -- repo: https://github.com/cpplint/cpplint - rev: 1.6.1 - hooks: - - id: cpplint - exclude: ^pandas/_libs/include/pandas/vendored/klib - args: [ - --quiet, - '--extensions=c,h', - '--headers=h', - --recursive, - --linelength=88, - '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size' - ] - repo: https://github.com/pylint-dev/pylint rev: v3.0.0a6 hooks: From 64b0f01fb1b9042e0c3c11e75f7b6d34975df43d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 11 Aug 2023 05:35:40 -0400 Subject: [PATCH 003/126] checkpoint --- pandas/_libs/arrays.pyx | 11 +++++++---- pandas/core/arrays/masked.py | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 4b7c86a067fa5..1168948718665 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -215,10 +215,8 @@ cdef class BitMaskArray: self.array_len = len(np_array) nbytes = len(np_array) // 8 + 1 self.validity_buffer = malloc(nbytes) - 
# malloc def __dealloc__(self): - ... free(self.validity_buffer) def __setitem__(self, key, value): @@ -230,5 +228,10 @@ cdef class BitMaskArray: def __getitem__(self, key): bool(ArrowBitGet(self.validity_buffer, key)) - def to_numpy(self): - ... + def to_numpy(self) -> ndarray: + cdef ndarray[uint8_t] result + result = np.empty(self.array_len, dtype=bool) + for i in range(self.array_len): + result = self[i] + + return result diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 8dc9e55e8a1de..14853895905f6 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -184,7 +184,7 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: # TODO: need to change this to special case multiple # indexers versus just scalar - newmask = self._mask[item] + newmask = self._mask.to_numpy()[item] if is_bool(newmask): # This is a scalar indexing if newmask: From e5238d964a6e168ecd40c03c67f58828ae9dd2b7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 11 Aug 2023 15:35:18 -0400 Subject: [PATCH 004/126] Passing test suite --- pandas/_libs/arrays.pyx | 24 +++++++++++++++---- pandas/core/arrays/masked.py | 8 +++---- pandas/tests/arrays/masked/test_arithmetic.py | 8 +++---- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 1168948718665..c8440de16bc38 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -21,6 +21,8 @@ from libc.stdlib cimport ( malloc, ) +from pandas._libs.lib import is_list_like + cdef extern from "pandas/vendored/nanoarrow.h": int8_t ArrowBitGet(const uint8_t*, int64_t) @@ -215,23 +217,35 @@ cdef class BitMaskArray: self.array_len = len(np_array) nbytes = len(np_array) // 8 + 1 self.validity_buffer = malloc(nbytes) + for index, value in enumerate(np_array): + self[index] = value def __dealloc__(self): free(self.validity_buffer) def __setitem__(self, key, value): - if value: - ArrowBitSet(self.validity_buffer, key) + if 
is_list_like(key): + for k in key: + if value: + ArrowBitSet(self.validity_buffer, k) + else: + ArrowBitClear(self.validity_buffer, k) else: - ArrowBitClear(self.validity_buffer, key) + if value: + ArrowBitSet(self.validity_buffer, key) + else: + ArrowBitClear(self.validity_buffer, key) def __getitem__(self, key): - bool(ArrowBitGet(self.validity_buffer, key)) + return bool(ArrowBitGet(self.validity_buffer, key)) + + def __invert__(self): + return ~self.to_numpy() def to_numpy(self) -> ndarray: cdef ndarray[uint8_t] result result = np.empty(self.array_len, dtype=bool) for i in range(self.array_len): - result = self[i] + result[i] = self[i] return result diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 14853895905f6..fb47982d3807c 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -696,7 +696,7 @@ def _propagate_mask( elif is_list_like(other) and len(other) == len(mask): mask = mask | isna(other) else: - mask = self._mask.to_numpy() | mask + mask = self._mask.to_numpy() | mask.to_numpy() # Incompatible return value type (got "Optional[ndarray[Any, dtype[bool_]]]", # expected "ndarray[Any, dtype[bool_]]") return mask # type: ignore[return-value] @@ -869,7 +869,7 @@ def _maybe_mask_result( # e.g. 
test_numeric_arr_mul_tdscalar_numexpr_path from pandas.core.arrays import TimedeltaArray - result[mask] = result.dtype.type("NaT") + result[mask.to_numpy()] = result.dtype.type("NaT") if not isinstance(result, TimedeltaArray): return TimedeltaArray._simple_new(result, dtype=result.dtype) @@ -882,7 +882,7 @@ def _maybe_mask_result( return IntegerArray(result, mask, copy=False) else: - result[mask] = np.nan + result[mask.to_numpy()] = np.nan return result def isna(self) -> np.ndarray: @@ -903,7 +903,7 @@ def _concat_same_type( axis: AxisInt = 0, ) -> Self: data = np.concatenate([x._data for x in to_concat], axis=axis) - mask = np.concatenate([x._mask for x in to_concat], axis=axis) + mask = np.concatenate([x._mask.to_numpy() for x in to_concat], axis=axis) return cls(data, mask) def take( diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py index f4b571ca627b3..21e292e5bbc29 100644 --- a/pandas/tests/arrays/masked/test_arithmetic.py +++ b/pandas/tests/arrays/masked/test_arithmetic.py @@ -76,22 +76,22 @@ def test_array_NA(data, all_arithmetic_operators): scalar = pd.NA scalar_array = pd.array([pd.NA] * len(data), dtype=data.dtype) - mask = data._mask.copy() + mask = data._mask.to_numpy().copy() if is_bool_not_implemented(data, all_arithmetic_operators): msg = "operator '.*' not implemented for bool dtypes" with pytest.raises(NotImplementedError, match=msg): op(data, scalar) # GH#45421 check op doesn't alter data._mask inplace - tm.assert_numpy_array_equal(mask, data._mask) + tm.assert_numpy_array_equal(mask, data._mask.to_numpy()) return result = op(data, scalar) # GH#45421 check op doesn't alter data._mask inplace - tm.assert_numpy_array_equal(mask, data._mask) + tm.assert_numpy_array_equal(mask, data._mask.to_numpy()) expected = op(data, scalar_array) - tm.assert_numpy_array_equal(mask, data._mask) + tm.assert_numpy_array_equal(mask, data._mask.to_numpy()) tm.assert_extension_array_equal(result, expected) From 
b63b6715a48951d96128fddd97f1a2f200eb6c8e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 11 Aug 2023 15:38:45 -0400 Subject: [PATCH 005/126] revert modifications to nanoarrow --- pandas/_libs/meson.build | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index fabcd220b8b30..29b1298050619 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -62,7 +62,7 @@ libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} # numpy include dir is implicitly included 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper, _khash_primitive_helper]}, - 'arrays': {'sources': ['arrays.pyx', 'src/vendored/nanoarrow.c']}, + 'arrays': {'sources': ['arrays.pyx', 'src/vendored/nanoarrow.c'], 'includes': ['include/pandas/vendored']}, 'groupby': {'sources': ['groupby.pyx']}, 'hashing': {'sources': ['hashing.pyx']}, 'hashtable': {'sources': ['hashtable.pyx', _khash_primitive_helper, _hashtable_class_helper, _hashtable_func_helper]}, @@ -106,7 +106,7 @@ foreach ext_name, ext_dict : libs_sources ext_name, ext_dict.get('sources'), cython_args: ['--include-dir', meson.current_build_dir(), '-X always_allow_keywords=true'], - include_directories: [inc_np, inc_pd], + include_directories: [inc_np, inc_pd] + ext_dict.get('includes', []), dependencies: ext_dict.get('deps', ''), subdir: 'pandas/_libs', install: true From fe31993dfef1ad4635f9bae307b0acdb9fd57167 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 11 Aug 2023 16:07:32 -0400 Subject: [PATCH 006/126] force vendor --- pandas/_libs/src/vendored/nanoarrow.c | 3107 +++++++++++++++++++++++++ 1 file changed, 3107 insertions(+) create mode 100644 pandas/_libs/src/vendored/nanoarrow.c diff --git a/pandas/_libs/src/vendored/nanoarrow.c b/pandas/_libs/src/vendored/nanoarrow.c new file mode 100644 index 0000000000000..7cc53b43550d7 --- /dev/null +++ b/pandas/_libs/src/vendored/nanoarrow.c @@ -0,0 +1,3107 @@ 
+// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include + +#include "pandas/vendored/nanoarrow.h" + +const char* ArrowNanoarrowVersion(void) { return NANOARROW_VERSION; } + +int ArrowNanoarrowVersionInt(void) { return NANOARROW_VERSION_INT; } + +int ArrowErrorSet(struct ArrowError* error, const char* fmt, ...) 
{ + if (error == NULL) { + return NANOARROW_OK; + } + + memset(error->message, 0, sizeof(error->message)); + + va_list args; + va_start(args, fmt); + int chars_needed = vsnprintf(error->message, sizeof(error->message), fmt, args); + va_end(args); + + if (chars_needed < 0) { + return EINVAL; + } else if (((size_t)chars_needed) >= sizeof(error->message)) { + return ERANGE; + } else { + return NANOARROW_OK; + } +} + +const char* ArrowErrorMessage(struct ArrowError* error) { + if (error == NULL) { + return ""; + } else { + return error->message; + } +} + +void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_VALIDITY; + layout->buffer_data_type[0] = NANOARROW_TYPE_BOOL; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = storage_type; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[2] = NANOARROW_TYPE_UNINITIALIZED; + + layout->element_size_bits[0] = 1; + layout->element_size_bits[1] = 0; + layout->element_size_bits[2] = 0; + + layout->child_size_elements = 0; + + switch (storage_type) { + case NANOARROW_TYPE_UNINITIALIZED: + case NANOARROW_TYPE_NA: + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[0] = NANOARROW_TYPE_UNINITIALIZED; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; + layout->element_size_bits[0] = 0; + break; + + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + break; + + case NANOARROW_TYPE_LARGE_LIST: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; + layout->element_size_bits[1] = 64; + break; + + case NANOARROW_TYPE_STRUCT: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + layout->buffer_type[1] = 
NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; + break; + + case NANOARROW_TYPE_BOOL: + layout->element_size_bits[1] = 1; + break; + + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT8: + layout->element_size_bits[1] = 8; + break; + + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_HALF_FLOAT: + layout->element_size_bits[1] = 16; + break; + + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_FLOAT: + layout->element_size_bits[1] = 32; + break; + case NANOARROW_TYPE_INTERVAL_MONTHS: + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + break; + + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + layout->element_size_bits[1] = 64; + break; + + case NANOARROW_TYPE_DECIMAL128: + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + layout->element_size_bits[1] = 128; + break; + + case NANOARROW_TYPE_DECIMAL256: + layout->element_size_bits[1] = 256; + break; + + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + layout->buffer_data_type[1] = NANOARROW_TYPE_BINARY; + break; + + case NANOARROW_TYPE_DENSE_UNION: + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; + layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; + layout->element_size_bits[0] = 8; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_UNION_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + break; + + case NANOARROW_TYPE_SPARSE_UNION: + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; + layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; + layout->element_size_bits[0] = 8; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; + break; + + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + 
layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = storage_type; + break; + + case NANOARROW_TYPE_LARGE_STRING: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; + layout->element_size_bits[1] = 64; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = NANOARROW_TYPE_STRING; + break; + case NANOARROW_TYPE_LARGE_BINARY: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; + layout->element_size_bits[1] = 64; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = NANOARROW_TYPE_BINARY; + break; + + default: + break; + } +} + +void* ArrowMalloc(int64_t size) { return malloc(size); } + +void* ArrowRealloc(void* ptr, int64_t size) { return realloc(ptr, size); } + +void ArrowFree(void* ptr) { free(ptr); } + +static uint8_t* ArrowBufferAllocatorMallocReallocate( + struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, + int64_t new_size) { + return (uint8_t*)ArrowRealloc(ptr, new_size); +} + +static void ArrowBufferAllocatorMallocFree(struct ArrowBufferAllocator* allocator, + uint8_t* ptr, int64_t size) { + ArrowFree(ptr); +} + +static struct ArrowBufferAllocator ArrowBufferAllocatorMalloc = { + &ArrowBufferAllocatorMallocReallocate, &ArrowBufferAllocatorMallocFree, NULL}; + +struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void) { + return ArrowBufferAllocatorMalloc; +} + +static uint8_t* ArrowBufferAllocatorNeverReallocate( + struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, + int64_t new_size) { + return NULL; +} + +struct ArrowBufferAllocator ArrowBufferDeallocator( + void (*custom_free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, + int64_t size), + void* private_data) { + struct ArrowBufferAllocator 
allocator; + allocator.reallocate = &ArrowBufferAllocatorNeverReallocate; + allocator.free = custom_free; + allocator.private_data = private_data; + return allocator; +} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include "pandas/vendored/nanoarrow.h" + +static void ArrowSchemaRelease(struct ArrowSchema* schema) { + if (schema->format != NULL) ArrowFree((void*)schema->format); + if (schema->name != NULL) ArrowFree((void*)schema->name); + if (schema->metadata != NULL) ArrowFree((void*)schema->metadata); + + // This object owns the memory for all the children, but those + // children may have been generated elsewhere and might have + // their own release() callback. + if (schema->children != NULL) { + for (int64_t i = 0; i < schema->n_children; i++) { + if (schema->children[i] != NULL) { + if (schema->children[i]->release != NULL) { + schema->children[i]->release(schema->children[i]); + } + + ArrowFree(schema->children[i]); + } + } + + ArrowFree(schema->children); + } + + // This object owns the memory for the dictionary but it + // may have been generated somewhere else and have its own + // release() callback. 
+ if (schema->dictionary != NULL) { + if (schema->dictionary->release != NULL) { + schema->dictionary->release(schema->dictionary); + } + + ArrowFree(schema->dictionary); + } + + // private data not currently used + if (schema->private_data != NULL) { + ArrowFree(schema->private_data); + } + + schema->release = NULL; +} + +static const char* ArrowSchemaFormatTemplate(enum ArrowType type) { + switch (type) { + case NANOARROW_TYPE_UNINITIALIZED: + return NULL; + case NANOARROW_TYPE_NA: + return "n"; + case NANOARROW_TYPE_BOOL: + return "b"; + + case NANOARROW_TYPE_UINT8: + return "C"; + case NANOARROW_TYPE_INT8: + return "c"; + case NANOARROW_TYPE_UINT16: + return "S"; + case NANOARROW_TYPE_INT16: + return "s"; + case NANOARROW_TYPE_UINT32: + return "I"; + case NANOARROW_TYPE_INT32: + return "i"; + case NANOARROW_TYPE_UINT64: + return "L"; + case NANOARROW_TYPE_INT64: + return "l"; + + case NANOARROW_TYPE_HALF_FLOAT: + return "e"; + case NANOARROW_TYPE_FLOAT: + return "f"; + case NANOARROW_TYPE_DOUBLE: + return "g"; + + case NANOARROW_TYPE_STRING: + return "u"; + case NANOARROW_TYPE_LARGE_STRING: + return "U"; + case NANOARROW_TYPE_BINARY: + return "z"; + case NANOARROW_TYPE_LARGE_BINARY: + return "Z"; + + case NANOARROW_TYPE_DATE32: + return "tdD"; + case NANOARROW_TYPE_DATE64: + return "tdm"; + case NANOARROW_TYPE_INTERVAL_MONTHS: + return "tiM"; + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + return "tiD"; + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + return "tin"; + + case NANOARROW_TYPE_LIST: + return "+l"; + case NANOARROW_TYPE_LARGE_LIST: + return "+L"; + case NANOARROW_TYPE_STRUCT: + return "+s"; + case NANOARROW_TYPE_MAP: + return "+m"; + + default: + return NULL; + } +} + +static int ArrowSchemaInitChildrenIfNeeded(struct ArrowSchema* schema, + enum ArrowType type) { + switch (type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1)); + 
ArrowSchemaInit(schema->children[0]); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "item")); + break; + case NANOARROW_TYPE_MAP: + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1)); + NANOARROW_RETURN_NOT_OK( + ArrowSchemaInitFromType(schema->children[0], NANOARROW_TYPE_STRUCT)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "entries")); + schema->children[0]->flags &= ~ARROW_FLAG_NULLABLE; + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema->children[0], 2)); + ArrowSchemaInit(schema->children[0]->children[0]); + ArrowSchemaInit(schema->children[0]->children[1]); + NANOARROW_RETURN_NOT_OK( + ArrowSchemaSetName(schema->children[0]->children[0], "key")); + schema->children[0]->children[0]->flags &= ~ARROW_FLAG_NULLABLE; + NANOARROW_RETURN_NOT_OK( + ArrowSchemaSetName(schema->children[0]->children[1], "value")); + break; + default: + break; + } + + return NANOARROW_OK; +} + +void ArrowSchemaInit(struct ArrowSchema* schema) { + schema->format = NULL; + schema->name = NULL; + schema->metadata = NULL; + schema->flags = ARROW_FLAG_NULLABLE; + schema->n_children = 0; + schema->children = NULL; + schema->dictionary = NULL; + schema->private_data = NULL; + schema->release = &ArrowSchemaRelease; +} + +ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type) { + // We don't allocate the dictionary because it has to be nullptr + // for non-dictionary-encoded arrays. 
+ + // Set the format to a valid format string for type + const char* template_format = ArrowSchemaFormatTemplate(type); + + // If type isn't recognized and not explicitly unset + if (template_format == NULL && type != NANOARROW_TYPE_UNINITIALIZED) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, template_format)); + + // For types with an umabiguous child structure, allocate children + return ArrowSchemaInitChildrenIfNeeded(schema, type); +} + +ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children) { + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_STRUCT)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children)); + for (int64_t i = 0; i < n_children; i++) { + ArrowSchemaInit(schema->children[i]); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type) { + ArrowSchemaInit(schema); + + int result = ArrowSchemaSetType(schema, type); + if (result != NANOARROW_OK) { + schema->release(schema); + return result; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema, + enum ArrowType type, int32_t fixed_size) { + if (fixed_size <= 0) { + return EINVAL; + } + + char buffer[64]; + int n_chars; + switch (type) { + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + n_chars = snprintf(buffer, sizeof(buffer), "w:%d", (int)fixed_size); + break; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + n_chars = snprintf(buffer, sizeof(buffer), "+w:%d", (int)fixed_size); + break; + default: + return EINVAL; + } + + buffer[n_chars] = '\0'; + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, buffer)); + + if (type == NANOARROW_TYPE_FIXED_SIZE_LIST) { + NANOARROW_RETURN_NOT_OK(ArrowSchemaInitChildrenIfNeeded(schema, type)); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type, + int32_t decimal_precision, + int32_t 
decimal_scale) { + if (decimal_precision <= 0) { + return EINVAL; + } + + char buffer[64]; + int n_chars; + switch (type) { + case NANOARROW_TYPE_DECIMAL128: + n_chars = + snprintf(buffer, sizeof(buffer), "d:%d,%d", decimal_precision, decimal_scale); + break; + case NANOARROW_TYPE_DECIMAL256: + n_chars = snprintf(buffer, sizeof(buffer), "d:%d,%d,256", decimal_precision, + decimal_scale); + break; + default: + return EINVAL; + } + + buffer[n_chars] = '\0'; + return ArrowSchemaSetFormat(schema, buffer); +} + +static const char* ArrowTimeUnitFormatString(enum ArrowTimeUnit time_unit) { + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + return "s"; + case NANOARROW_TIME_UNIT_MILLI: + return "m"; + case NANOARROW_TIME_UNIT_MICRO: + return "u"; + case NANOARROW_TIME_UNIT_NANO: + return "n"; + default: + return NULL; + } +} + +ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type, + enum ArrowTimeUnit time_unit, + const char* timezone) { + const char* time_unit_str = ArrowTimeUnitFormatString(time_unit); + if (time_unit_str == NULL) { + return EINVAL; + } + + char buffer[128]; + int n_chars; + switch (type) { + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_TIME64: + if (timezone != NULL) { + return EINVAL; + } + n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); + break; + case NANOARROW_TYPE_TIMESTAMP: + if (timezone == NULL) { + timezone = ""; + } + n_chars = snprintf(buffer, sizeof(buffer), "ts%s:%s", time_unit_str, timezone); + break; + case NANOARROW_TYPE_DURATION: + if (timezone != NULL) { + return EINVAL; + } + n_chars = snprintf(buffer, sizeof(buffer), "tD%s", time_unit_str); + break; + default: + return EINVAL; + } + + if (((size_t)n_chars) >= sizeof(buffer)) { + return ERANGE; + } + + buffer[n_chars] = '\0'; + + return ArrowSchemaSetFormat(schema, buffer); +} + +ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type, + int64_t n_children) { + if (n_children < 0 || 
n_children > 127) { + return EINVAL; + } + + // Max valid size would be +ud:0,1,...126 = 401 characters + null terminator + char format_out[512]; + int64_t format_out_size = 512; + memset(format_out, 0, format_out_size); + int n_chars; + char* format_cursor = format_out; + + switch (type) { + case NANOARROW_TYPE_SPARSE_UNION: + n_chars = snprintf(format_cursor, format_out_size, "+us:"); + format_cursor += n_chars; + format_out_size -= n_chars; + break; + case NANOARROW_TYPE_DENSE_UNION: + n_chars = snprintf(format_cursor, format_out_size, "+ud:"); + format_cursor += n_chars; + format_out_size -= n_chars; + break; + default: + return EINVAL; + } + + if (n_children > 0) { + n_chars = snprintf(format_cursor, format_out_size, "0"); + format_cursor += n_chars; + format_out_size -= n_chars; + + for (int64_t i = 1; i < n_children; i++) { + n_chars = snprintf(format_cursor, format_out_size, ",%d", (int)i); + format_cursor += n_chars; + format_out_size -= n_chars; + } + } + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, format_out)); + + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children)); + for (int64_t i = 0; i < n_children; i++) { + ArrowSchemaInit(schema->children[i]); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format) { + if (schema->format != NULL) { + ArrowFree((void*)schema->format); + } + + if (format != NULL) { + size_t format_size = strlen(format) + 1; + schema->format = (const char*)ArrowMalloc(format_size); + if (schema->format == NULL) { + return ENOMEM; + } + + memcpy((void*)schema->format, format, format_size); + } else { + schema->format = NULL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name) { + if (schema->name != NULL) { + ArrowFree((void*)schema->name); + } + + if (name != NULL) { + size_t name_size = strlen(name) + 1; + schema->name = (const char*)ArrowMalloc(name_size); + if (schema->name 
== NULL) { + return ENOMEM; + } + + memcpy((void*)schema->name, name, name_size); + } else { + schema->name = NULL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* metadata) { + if (schema->metadata != NULL) { + ArrowFree((void*)schema->metadata); + } + + if (metadata != NULL) { + size_t metadata_size = ArrowMetadataSizeOf(metadata); + schema->metadata = (const char*)ArrowMalloc(metadata_size); + if (schema->metadata == NULL) { + return ENOMEM; + } + + memcpy((void*)schema->metadata, metadata, metadata_size); + } else { + schema->metadata = NULL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema, + int64_t n_children) { + if (schema->children != NULL) { + return EEXIST; + } + + if (n_children > 0) { + schema->children = + (struct ArrowSchema**)ArrowMalloc(n_children * sizeof(struct ArrowSchema*)); + + if (schema->children == NULL) { + return ENOMEM; + } + + schema->n_children = n_children; + + memset(schema->children, 0, n_children * sizeof(struct ArrowSchema*)); + + for (int64_t i = 0; i < n_children; i++) { + schema->children[i] = (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema)); + + if (schema->children[i] == NULL) { + return ENOMEM; + } + + schema->children[i]->release = NULL; + } + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema) { + if (schema->dictionary != NULL) { + return EEXIST; + } + + schema->dictionary = (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema)); + if (schema->dictionary == NULL) { + return ENOMEM; + } + + schema->dictionary->release = NULL; + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaDeepCopy(struct ArrowSchema* schema, + struct ArrowSchema* schema_out) { + ArrowSchemaInit(schema_out); + + int result = ArrowSchemaSetFormat(schema_out, schema->format); + if (result != NANOARROW_OK) { + schema_out->release(schema_out); + return result; + 
} + + schema_out->flags = schema->flags; + + result = ArrowSchemaSetName(schema_out, schema->name); + if (result != NANOARROW_OK) { + schema_out->release(schema_out); + return result; + } + + result = ArrowSchemaSetMetadata(schema_out, schema->metadata); + if (result != NANOARROW_OK) { + schema_out->release(schema_out); + return result; + } + + result = ArrowSchemaAllocateChildren(schema_out, schema->n_children); + if (result != NANOARROW_OK) { + schema_out->release(schema_out); + return result; + } + + for (int64_t i = 0; i < schema->n_children; i++) { + result = ArrowSchemaDeepCopy(schema->children[i], schema_out->children[i]); + if (result != NANOARROW_OK) { + schema_out->release(schema_out); + return result; + } + } + + if (schema->dictionary != NULL) { + result = ArrowSchemaAllocateDictionary(schema_out); + if (result != NANOARROW_OK) { + schema_out->release(schema_out); + return result; + } + + result = ArrowSchemaDeepCopy(schema->dictionary, schema_out->dictionary); + if (result != NANOARROW_OK) { + schema_out->release(schema_out); + return result; + } + } + + return NANOARROW_OK; +} + +static void ArrowSchemaViewSetPrimitive(struct ArrowSchemaView* schema_view, + enum ArrowType type) { + schema_view->type = type; + schema_view->storage_type = type; +} + +static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view, + const char* format, + const char** format_end_out, + struct ArrowError* error) { + *format_end_out = format; + + // needed for decimal parsing + const char* parse_start; + char* parse_end; + + switch (format[0]) { + case 'n': + schema_view->type = NANOARROW_TYPE_NA; + schema_view->storage_type = NANOARROW_TYPE_NA; + *format_end_out = format + 1; + return NANOARROW_OK; + case 'b': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_BOOL); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'c': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT8); + *format_end_out = format + 1; + return NANOARROW_OK; 
+ case 'C': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT8); + *format_end_out = format + 1; + return NANOARROW_OK; + case 's': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT16); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'S': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT16); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'i': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'I': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT32); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'l': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'L': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT64); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'e': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_HALF_FLOAT); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'f': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_FLOAT); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'g': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DOUBLE); + *format_end_out = format + 1; + return NANOARROW_OK; + + // decimal + case 'd': + if (format[1] != ':' || format[2] == '\0') { + ArrowErrorSet(error, "Expected ':precision,scale[,bitwidth]' following 'd'", + format + 3); + return EINVAL; + } + + parse_start = format + 2; + schema_view->decimal_precision = (int32_t)strtol(parse_start, &parse_end, 10); + if (parse_end == parse_start || parse_end[0] != ',') { + ArrowErrorSet(error, "Expected 'precision,scale[,bitwidth]' following 'd:'"); + return EINVAL; + } + + parse_start = parse_end + 1; + schema_view->decimal_scale = (int32_t)strtol(parse_start, &parse_end, 10); + if (parse_end == parse_start) { + ArrowErrorSet(error, "Expected 'scale[,bitwidth]' 
following 'd:precision,'"); + return EINVAL; + } else if (parse_end[0] != ',') { + schema_view->decimal_bitwidth = 128; + } else { + parse_start = parse_end + 1; + schema_view->decimal_bitwidth = (int32_t)strtol(parse_start, &parse_end, 10); + if (parse_start == parse_end) { + ArrowErrorSet(error, "Expected precision following 'd:precision,scale,'"); + return EINVAL; + } + } + + *format_end_out = parse_end; + + switch (schema_view->decimal_bitwidth) { + case 128: + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL128); + return NANOARROW_OK; + case 256: + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL256); + return NANOARROW_OK; + default: + ArrowErrorSet(error, "Expected decimal bitwidth of 128 or 256 but found %d", + (int)schema_view->decimal_bitwidth); + return EINVAL; + } + + // validity + data + case 'w': + schema_view->type = NANOARROW_TYPE_FIXED_SIZE_BINARY; + schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_BINARY; + if (format[1] != ':' || format[2] == '\0') { + ArrowErrorSet(error, "Expected ':' following 'w'"); + return EINVAL; + } + + schema_view->fixed_size = (int32_t)strtol(format + 2, (char**)format_end_out, 10); + return NANOARROW_OK; + + // validity + offset + data + case 'z': + schema_view->type = NANOARROW_TYPE_BINARY; + schema_view->storage_type = NANOARROW_TYPE_BINARY; + *format_end_out = format + 1; + return NANOARROW_OK; + case 'u': + schema_view->type = NANOARROW_TYPE_STRING; + schema_view->storage_type = NANOARROW_TYPE_STRING; + *format_end_out = format + 1; + return NANOARROW_OK; + + // validity + large_offset + data + case 'Z': + schema_view->type = NANOARROW_TYPE_LARGE_BINARY; + schema_view->storage_type = NANOARROW_TYPE_LARGE_BINARY; + *format_end_out = format + 1; + return NANOARROW_OK; + case 'U': + schema_view->type = NANOARROW_TYPE_LARGE_STRING; + schema_view->storage_type = NANOARROW_TYPE_LARGE_STRING; + *format_end_out = format + 1; + return NANOARROW_OK; + + // nested types + case '+': + switch 
(format[1]) { + // list has validity + offset or offset + case 'l': + schema_view->storage_type = NANOARROW_TYPE_LIST; + schema_view->type = NANOARROW_TYPE_LIST; + *format_end_out = format + 2; + return NANOARROW_OK; + + // large list has validity + large_offset or large_offset + case 'L': + schema_view->storage_type = NANOARROW_TYPE_LARGE_LIST; + schema_view->type = NANOARROW_TYPE_LARGE_LIST; + *format_end_out = format + 2; + return NANOARROW_OK; + + // just validity buffer + case 'w': + if (format[2] != ':' || format[3] == '\0') { + ArrowErrorSet(error, "Expected ':' following '+w'"); + return EINVAL; + } + + schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_LIST; + schema_view->type = NANOARROW_TYPE_FIXED_SIZE_LIST; + schema_view->fixed_size = + (int32_t)strtol(format + 3, (char**)format_end_out, 10); + return NANOARROW_OK; + case 's': + schema_view->storage_type = NANOARROW_TYPE_STRUCT; + schema_view->type = NANOARROW_TYPE_STRUCT; + *format_end_out = format + 2; + return NANOARROW_OK; + case 'm': + schema_view->storage_type = NANOARROW_TYPE_MAP; + schema_view->type = NANOARROW_TYPE_MAP; + *format_end_out = format + 2; + return NANOARROW_OK; + + // unions + case 'u': + switch (format[2]) { + case 'd': + schema_view->storage_type = NANOARROW_TYPE_DENSE_UNION; + schema_view->type = NANOARROW_TYPE_DENSE_UNION; + break; + case 's': + schema_view->storage_type = NANOARROW_TYPE_SPARSE_UNION; + schema_view->type = NANOARROW_TYPE_SPARSE_UNION; + break; + default: + ArrowErrorSet(error, + "Expected union format string +us: or " + "+ud: but found '%s'", + format); + return EINVAL; + } + + if (format[3] == ':') { + schema_view->union_type_ids = format + 4; + int64_t n_type_ids = + _ArrowParseUnionTypeIds(schema_view->union_type_ids, NULL); + if (n_type_ids != schema_view->schema->n_children) { + ArrowErrorSet( + error, + "Expected union type_ids parameter to be a comma-separated list of %ld " + "values between 0 and 127 but found '%s'", + 
(long)schema_view->schema->n_children, schema_view->union_type_ids); + return EINVAL; + } + *format_end_out = format + strlen(format); + return NANOARROW_OK; + } else { + ArrowErrorSet(error, + "Expected union format string +us: or +ud: " + "but found '%s'", + format); + return EINVAL; + } + + default: + ArrowErrorSet(error, "Expected nested type format string but found '%s'", + format); + return EINVAL; + } + + // date/time types + case 't': + switch (format[1]) { + // date + case 'd': + switch (format[2]) { + case 'D': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); + schema_view->type = NANOARROW_TYPE_DATE32; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'm': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DATE64; + *format_end_out = format + 3; + return NANOARROW_OK; + default: + ArrowErrorSet(error, "Expected 'D' or 'm' following 'td' but found '%s'", + format + 2); + return EINVAL; + } + + // time of day + case 't': + switch (format[2]) { + case 's': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); + schema_view->type = NANOARROW_TYPE_TIME32; + schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'm': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); + schema_view->type = NANOARROW_TYPE_TIME32; + schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'u': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIME64; + schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'n': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIME64; + schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; + *format_end_out = format + 3; + return NANOARROW_OK; + default: + 
ArrowErrorSet( + error, "Expected 's', 'm', 'u', or 'n' following 'tt' but found '%s'", + format + 2); + return EINVAL; + } + + // timestamp + case 's': + switch (format[2]) { + case 's': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIMESTAMP; + schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; + break; + case 'm': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIMESTAMP; + schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; + break; + case 'u': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIMESTAMP; + schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; + break; + case 'n': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIMESTAMP; + schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; + break; + default: + ArrowErrorSet( + error, "Expected 's', 'm', 'u', or 'n' following 'ts' but found '%s'", + format + 2); + return EINVAL; + } + + if (format[3] != ':') { + ArrowErrorSet(error, "Expected ':' following '%.3s' but found '%s'", format, + format + 3); + return EINVAL; + } + + schema_view->timezone = format + 4; + *format_end_out = format + strlen(format); + return NANOARROW_OK; + + // duration + case 'D': + switch (format[2]) { + case 's': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DURATION; + schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'm': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DURATION; + schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'u': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DURATION; + schema_view->time_unit = 
NANOARROW_TIME_UNIT_MICRO; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'n': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DURATION; + schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; + *format_end_out = format + 3; + return NANOARROW_OK; + default: + ArrowErrorSet(error, + "Expected 's', 'm', u', or 'n' following 'tD' but found '%s'", + format + 2); + return EINVAL; + } + + // interval + case 'i': + switch (format[2]) { + case 'M': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_MONTHS); + *format_end_out = format + 3; + return NANOARROW_OK; + case 'D': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_DAY_TIME); + *format_end_out = format + 3; + return NANOARROW_OK; + case 'n': + ArrowSchemaViewSetPrimitive(schema_view, + NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO); + *format_end_out = format + 3; + return NANOARROW_OK; + default: + ArrowErrorSet(error, + "Expected 'M', 'D', or 'n' following 'ti' but found '%s'", + format + 2); + return EINVAL; + } + + default: + ArrowErrorSet( + error, "Expected 'd', 't', 's', 'D', or 'i' following 't' but found '%s'", + format + 1); + return EINVAL; + } + + default: + ArrowErrorSet(error, "Unknown format: '%s'", format); + return EINVAL; + } +} + +static ArrowErrorCode ArrowSchemaViewValidateNChildren( + struct ArrowSchemaView* schema_view, int64_t n_children, struct ArrowError* error) { + if (n_children != -1 && schema_view->schema->n_children != n_children) { + ArrowErrorSet(error, "Expected schema with %d children but found %d children", + (int)n_children, (int)schema_view->schema->n_children); + return EINVAL; + } + + // Don't do a full validation of children but do check that they won't + // segfault if inspected + struct ArrowSchema* child; + for (int64_t i = 0; i < schema_view->schema->n_children; i++) { + child = schema_view->schema->children[i]; + if (child == NULL) { + ArrowErrorSet(error, "Expected valid schema 
at schema->children[%d] but found NULL", + i); + return EINVAL; + } else if (child->release == NULL) { + ArrowErrorSet( + error, + "Expected valid schema at schema->children[%d] but found a released schema", i); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowSchemaViewValidateUnion(struct ArrowSchemaView* schema_view, + struct ArrowError* error) { + return ArrowSchemaViewValidateNChildren(schema_view, -1, error); +} + +static ArrowErrorCode ArrowSchemaViewValidateMap(struct ArrowSchemaView* schema_view, + struct ArrowError* error) { + NANOARROW_RETURN_NOT_OK(ArrowSchemaViewValidateNChildren(schema_view, 1, error)); + + if (schema_view->schema->children[0]->n_children != 2) { + ArrowErrorSet(error, "Expected child of map type to have 2 children but found %d", + (int)schema_view->schema->children[0]->n_children); + return EINVAL; + } + + if (strcmp(schema_view->schema->children[0]->format, "+s") != 0) { + ArrowErrorSet(error, "Expected format of child of map type to be '+s' but found '%s'", + schema_view->schema->children[0]->format); + return EINVAL; + } + + if (schema_view->schema->children[0]->flags & ARROW_FLAG_NULLABLE) { + ArrowErrorSet(error, + "Expected child of map type to be non-nullable but was nullable"); + return EINVAL; + } + + if (schema_view->schema->children[0]->children[0]->flags & ARROW_FLAG_NULLABLE) { + ArrowErrorSet(error, "Expected key of map type to be non-nullable but was nullable"); + return EINVAL; + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowSchemaViewValidateDictionary( + struct ArrowSchemaView* schema_view, struct ArrowError* error) { + // check for valid index type + switch (schema_view->storage_type) { + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_INT64: + break; + default: + ArrowErrorSet( + error, + 
"Expected dictionary schema index type to be an integral type but found '%s'", + schema_view->schema->format); + return EINVAL; + } + + struct ArrowSchemaView dictionary_schema_view; + return ArrowSchemaViewInit(&dictionary_schema_view, schema_view->schema->dictionary, + error); +} + +static ArrowErrorCode ArrowSchemaViewValidate(struct ArrowSchemaView* schema_view, + enum ArrowType type, + struct ArrowError* error) { + switch (type) { + case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_BOOL: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_HALF_FLOAT: + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_DECIMAL128: + case NANOARROW_TYPE_DECIMAL256: + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + case NANOARROW_TYPE_DATE32: + case NANOARROW_TYPE_DATE64: + case NANOARROW_TYPE_INTERVAL_MONTHS: + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + case NANOARROW_TYPE_TIMESTAMP: + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_TIME64: + case NANOARROW_TYPE_DURATION: + return ArrowSchemaViewValidateNChildren(schema_view, 0, error); + + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + if (schema_view->fixed_size <= 0) { + ArrowErrorSet(error, "Expected size > 0 for fixed size binary but found size %d", + schema_view->fixed_size); + return EINVAL; + } + return ArrowSchemaViewValidateNChildren(schema_view, 0, error); + + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + return ArrowSchemaViewValidateNChildren(schema_view, 1, error); + + case NANOARROW_TYPE_STRUCT: + return ArrowSchemaViewValidateNChildren(schema_view, -1, error); + + case NANOARROW_TYPE_SPARSE_UNION: + case 
NANOARROW_TYPE_DENSE_UNION: + return ArrowSchemaViewValidateUnion(schema_view, error); + + case NANOARROW_TYPE_MAP: + return ArrowSchemaViewValidateMap(schema_view, error); + + case NANOARROW_TYPE_DICTIONARY: + return ArrowSchemaViewValidateDictionary(schema_view, error); + + default: + ArrowErrorSet(error, "Expected a valid enum ArrowType value but found %d", + (int)schema_view->type); + return EINVAL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, + struct ArrowSchema* schema, struct ArrowError* error) { + if (schema == NULL) { + ArrowErrorSet(error, "Expected non-NULL schema"); + return EINVAL; + } + + if (schema->release == NULL) { + ArrowErrorSet(error, "Expected non-released schema"); + return EINVAL; + } + + schema_view->schema = schema; + + const char* format = schema->format; + if (format == NULL) { + ArrowErrorSet( + error, + "Error parsing schema->format: Expected a null-terminated string but found NULL"); + return EINVAL; + } + + size_t format_len = strlen(format); + if (format_len == 0) { + ArrowErrorSet(error, "Error parsing schema->format: Expected a string with size > 0"); + return EINVAL; + } + + const char* format_end_out; + ArrowErrorCode result = + ArrowSchemaViewParse(schema_view, format, &format_end_out, error); + + if (result != NANOARROW_OK) { + if (error != NULL) { + char child_error[1024]; + memcpy(child_error, ArrowErrorMessage(error), 1024); + ArrowErrorSet(error, "Error parsing schema->format: %s", child_error); + } + + return result; + } + + if ((format + format_len) != format_end_out) { + ArrowErrorSet(error, "Error parsing schema->format '%s': parsed %d/%d characters", + format, (int)(format_end_out - format), (int)(format_len)); + return EINVAL; + } + + if (schema->dictionary != NULL) { + schema_view->type = NANOARROW_TYPE_DICTIONARY; + } + + result = ArrowSchemaViewValidate(schema_view, schema_view->storage_type, error); + if (result != NANOARROW_OK) { + return result; + } 
+ + if (schema_view->storage_type != schema_view->type) { + result = ArrowSchemaViewValidate(schema_view, schema_view->type, error); + if (result != NANOARROW_OK) { + return result; + } + } + + ArrowLayoutInit(&schema_view->layout, schema_view->storage_type); + if (schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_BINARY) { + schema_view->layout.element_size_bits[1] = schema_view->fixed_size * 8; + } else if (schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_LIST) { + schema_view->layout.child_size_elements = schema_view->fixed_size; + } + + schema_view->extension_name = ArrowCharView(NULL); + schema_view->extension_metadata = ArrowCharView(NULL); + ArrowMetadataGetValue(schema->metadata, ArrowCharView("ARROW:extension:name"), + &schema_view->extension_name); + ArrowMetadataGetValue(schema->metadata, ArrowCharView("ARROW:extension:metadata"), + &schema_view->extension_metadata); + + return NANOARROW_OK; +} + +static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_view, + char* out, int64_t n) { + const char* type_string = ArrowTypeString(schema_view->type); + switch (schema_view->type) { + case NANOARROW_TYPE_DECIMAL128: + case NANOARROW_TYPE_DECIMAL256: + return snprintf(out, n, "%s(%d, %d)", type_string, + (int)schema_view->decimal_precision, + (int)schema_view->decimal_scale); + case NANOARROW_TYPE_TIMESTAMP: + return snprintf(out, n, "%s('%s', '%s')", type_string, + ArrowTimeUnitString(schema_view->time_unit), schema_view->timezone); + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_TIME64: + case NANOARROW_TYPE_DURATION: + return snprintf(out, n, "%s('%s')", type_string, + ArrowTimeUnitString(schema_view->time_unit)); + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + return snprintf(out, n, "%s(%ld)", type_string, (long)schema_view->fixed_size); + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_DENSE_UNION: + return snprintf(out, n, "%s([%s])", type_string, schema_view->union_type_ids); 
+ default: + return snprintf(out, n, "%s", type_string); + } +} + +// Helper for bookeeping to emulate sprintf()-like behaviour spread +// among multiple sprintf calls. +static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last, + int64_t* n_remaining, int64_t* n_chars) { + *n_chars += n_chars_last; + *n_remaining -= n_chars_last; + + // n_remaining is never less than 0 + if (*n_remaining < 0) { + *n_remaining = 0; + } + + // Can't do math on a NULL pointer + if (*out != NULL) { + *out += n_chars_last; + } +} + +int64_t ArrowSchemaToString(struct ArrowSchema* schema, char* out, int64_t n, + char recursive) { + if (schema == NULL) { + return snprintf(out, n, "[invalid: pointer is null]"); + } + + if (schema->release == NULL) { + return snprintf(out, n, "[invalid: schema is released]"); + } + + struct ArrowSchemaView schema_view; + struct ArrowError error; + + if (ArrowSchemaViewInit(&schema_view, schema, &error) != NANOARROW_OK) { + return snprintf(out, n, "[invalid: %s]", ArrowErrorMessage(&error)); + } + + // Extension type and dictionary should include both the top-level type + // and the storage type. 
+ int is_extension = schema_view.extension_name.size_bytes > 0; + int is_dictionary = schema->dictionary != NULL; + int64_t n_chars = 0; + int64_t n_chars_last = 0; + + // Uncommon but not technically impossible that both are true + if (is_extension && is_dictionary) { + n_chars_last = snprintf( + out, n, "%.*s{dictionary(%s)<", (int)schema_view.extension_name.size_bytes, + schema_view.extension_name.data, ArrowTypeString(schema_view.storage_type)); + } else if (is_extension) { + n_chars_last = snprintf(out, n, "%.*s{", (int)schema_view.extension_name.size_bytes, + schema_view.extension_name.data); + } else if (is_dictionary) { + n_chars_last = + snprintf(out, n, "dictionary(%s)<", ArrowTypeString(schema_view.storage_type)); + } + + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + + if (!is_dictionary) { + n_chars_last = ArrowSchemaTypeToStringInternal(&schema_view, out, n); + } else { + n_chars_last = ArrowSchemaToString(schema->dictionary, out, n, recursive); + } + + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + + if (recursive && schema->format[0] == '+') { + n_chars_last = snprintf(out, n, "<"); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + + for (int64_t i = 0; i < schema->n_children; i++) { + if (i > 0) { + n_chars_last = snprintf(out, n, ", "); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + // ArrowSchemaToStringInternal() will validate the child and print the error, + // but we need the name first + if (schema->children[i] != NULL && schema->children[i]->release != NULL && + schema->children[i]->name != NULL) { + n_chars_last = snprintf(out, n, "%s: ", schema->children[i]->name); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + n_chars_last = ArrowSchemaToString(schema->children[i], out, n, recursive); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + n_chars_last = snprintf(out, n, ">"); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + if 
(is_extension && is_dictionary) { + n_chars += snprintf(out, n, ">}"); + } else if (is_extension) { + n_chars += snprintf(out, n, "}"); + } else if (is_dictionary) { + n_chars += snprintf(out, n, ">"); + } + + return n_chars; +} + +ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader, + const char* metadata) { + reader->metadata = metadata; + + if (reader->metadata == NULL) { + reader->offset = 0; + reader->remaining_keys = 0; + } else { + memcpy(&reader->remaining_keys, reader->metadata, sizeof(int32_t)); + reader->offset = sizeof(int32_t); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader, + struct ArrowStringView* key_out, + struct ArrowStringView* value_out) { + if (reader->remaining_keys <= 0) { + return EINVAL; + } + + int64_t pos = 0; + + int32_t key_size; + memcpy(&key_size, reader->metadata + reader->offset + pos, sizeof(int32_t)); + pos += sizeof(int32_t); + + key_out->data = reader->metadata + reader->offset + pos; + key_out->size_bytes = key_size; + pos += key_size; + + int32_t value_size; + memcpy(&value_size, reader->metadata + reader->offset + pos, sizeof(int32_t)); + pos += sizeof(int32_t); + + value_out->data = reader->metadata + reader->offset + pos; + value_out->size_bytes = value_size; + pos += value_size; + + reader->offset += pos; + reader->remaining_keys--; + return NANOARROW_OK; +} + +int64_t ArrowMetadataSizeOf(const char* metadata) { + if (metadata == NULL) { + return 0; + } + + struct ArrowMetadataReader reader; + struct ArrowStringView key; + struct ArrowStringView value; + ArrowMetadataReaderInit(&reader, metadata); + + int64_t size = sizeof(int32_t); + while (ArrowMetadataReaderRead(&reader, &key, &value) == NANOARROW_OK) { + size += sizeof(int32_t) + key.size_bytes + sizeof(int32_t) + value.size_bytes; + } + + return size; +} + +static ArrowErrorCode ArrowMetadataGetValueInternal(const char* metadata, + struct ArrowStringView* key, + struct 
ArrowStringView* value_out) { + struct ArrowMetadataReader reader; + struct ArrowStringView existing_key; + struct ArrowStringView existing_value; + ArrowMetadataReaderInit(&reader, metadata); + + while (ArrowMetadataReaderRead(&reader, &existing_key, &existing_value) == + NANOARROW_OK) { + int key_equal = key->size_bytes == existing_key.size_bytes && + strncmp(key->data, existing_key.data, existing_key.size_bytes) == 0; + if (key_equal) { + value_out->data = existing_value.data; + value_out->size_bytes = existing_value.size_bytes; + break; + } + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key, + struct ArrowStringView* value_out) { + if (value_out == NULL) { + return EINVAL; + } + + return ArrowMetadataGetValueInternal(metadata, &key, value_out); +} + +char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key) { + struct ArrowStringView value = ArrowCharView(NULL); + ArrowMetadataGetValue(metadata, key, &value); + return value.data != NULL; +} + +ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, + const char* metadata) { + ArrowBufferInit(buffer); + return ArrowBufferAppend(buffer, metadata, ArrowMetadataSizeOf(metadata)); +} + +static ArrowErrorCode ArrowMetadataBuilderAppendInternal(struct ArrowBuffer* buffer, + struct ArrowStringView* key, + struct ArrowStringView* value) { + if (value == NULL) { + return NANOARROW_OK; + } + + if (buffer->capacity_bytes == 0) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(buffer, 0)); + } + + if (((size_t)buffer->capacity_bytes) < sizeof(int32_t)) { + return EINVAL; + } + + int32_t n_keys; + memcpy(&n_keys, buffer->data, sizeof(int32_t)); + + int32_t key_size = (int32_t)key->size_bytes; + int32_t value_size = (int32_t)value->size_bytes; + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve( + buffer, sizeof(int32_t) + key_size + sizeof(int32_t) + value_size)); + + ArrowBufferAppendUnsafe(buffer, &key_size, sizeof(int32_t)); + 
ArrowBufferAppendUnsafe(buffer, key->data, key_size); + ArrowBufferAppendUnsafe(buffer, &value_size, sizeof(int32_t)); + ArrowBufferAppendUnsafe(buffer, value->data, value_size); + + n_keys++; + memcpy(buffer->data, &n_keys, sizeof(int32_t)); + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowMetadataBuilderSetInternal(struct ArrowBuffer* buffer, + struct ArrowStringView* key, + struct ArrowStringView* value) { + // Inspect the current value to see if we can avoid copying the buffer + struct ArrowStringView current_value = ArrowCharView(NULL); + NANOARROW_RETURN_NOT_OK( + ArrowMetadataGetValueInternal((const char*)buffer->data, key, ¤t_value)); + + // The key should be removed but no key exists + if (value == NULL && current_value.data == NULL) { + return NANOARROW_OK; + } + + // The key/value can be appended because no key exists + if (value != NULL && current_value.data == NULL) { + return ArrowMetadataBuilderAppendInternal(buffer, key, value); + } + + struct ArrowMetadataReader reader; + struct ArrowStringView existing_key; + struct ArrowStringView existing_value; + NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, (const char*)buffer->data)); + + struct ArrowBuffer new_buffer; + NANOARROW_RETURN_NOT_OK(ArrowMetadataBuilderInit(&new_buffer, NULL)); + + while (reader.remaining_keys > 0) { + int result = ArrowMetadataReaderRead(&reader, &existing_key, &existing_value); + if (result != NANOARROW_OK) { + ArrowBufferReset(&new_buffer); + return result; + } + + if (key->size_bytes == existing_key.size_bytes && + strncmp((const char*)key->data, (const char*)existing_key.data, + existing_key.size_bytes) == 0) { + result = ArrowMetadataBuilderAppendInternal(&new_buffer, key, value); + value = NULL; + } else { + result = + ArrowMetadataBuilderAppendInternal(&new_buffer, &existing_key, &existing_value); + } + + if (result != NANOARROW_OK) { + ArrowBufferReset(&new_buffer); + return result; + } + } + + ArrowBufferReset(buffer); + ArrowBufferMove(&new_buffer, 
buffer); + return NANOARROW_OK; +} + +ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value) { + return ArrowMetadataBuilderAppendInternal(buffer, &key, &value); +} + +ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value) { + return ArrowMetadataBuilderSetInternal(buffer, &key, &value); +} + +ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, + struct ArrowStringView key) { + return ArrowMetadataBuilderSetInternal(buffer, &key, NULL); +} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "pandas/vendored/nanoarrow.h" + +static void ArrowArrayRelease(struct ArrowArray* array) { + // Release buffers held by this array + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + if (private_data != NULL) { + ArrowBitmapReset(&private_data->bitmap); + ArrowBufferReset(&private_data->buffers[0]); + ArrowBufferReset(&private_data->buffers[1]); + ArrowFree(private_data); + } + + // This object owns the memory for all the children, but those + // children may have been generated elsewhere and might have + // their own release() callback. + if (array->children != NULL) { + for (int64_t i = 0; i < array->n_children; i++) { + if (array->children[i] != NULL) { + if (array->children[i]->release != NULL) { + array->children[i]->release(array->children[i]); + } + + ArrowFree(array->children[i]); + } + } + + ArrowFree(array->children); + } + + // This object owns the memory for the dictionary but it + // may have been generated somewhere else and have its own + // release() callback. 
+ if (array->dictionary != NULL) { + if (array->dictionary->release != NULL) { + array->dictionary->release(array->dictionary); + } + + ArrowFree(array->dictionary); + } + + // Mark released + array->release = NULL; +} + +static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, + enum ArrowType storage_type) { + switch (storage_type) { + case NANOARROW_TYPE_UNINITIALIZED: + case NANOARROW_TYPE_NA: + array->n_buffers = 0; + break; + + case NANOARROW_TYPE_FIXED_SIZE_LIST: + case NANOARROW_TYPE_STRUCT: + case NANOARROW_TYPE_SPARSE_UNION: + array->n_buffers = 1; + break; + + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_MAP: + case NANOARROW_TYPE_BOOL: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_HALF_FLOAT: + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_DECIMAL128: + case NANOARROW_TYPE_DECIMAL256: + case NANOARROW_TYPE_INTERVAL_MONTHS: + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + case NANOARROW_TYPE_DENSE_UNION: + array->n_buffers = 2; + break; + + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + array->n_buffers = 3; + break; + + default: + return EINVAL; + + return NANOARROW_OK; + } + + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + private_data->storage_type = storage_type; + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, + enum ArrowType storage_type) { + array->length = 0; + array->null_count = 0; + array->offset = 0; + array->n_buffers = 0; + array->n_children = 0; + array->buffers = NULL; + array->children 
= NULL; + array->dictionary = NULL; + array->release = &ArrowArrayRelease; + array->private_data = NULL; + + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)ArrowMalloc(sizeof(struct ArrowArrayPrivateData)); + if (private_data == NULL) { + array->release = NULL; + return ENOMEM; + } + + ArrowBitmapInit(&private_data->bitmap); + ArrowBufferInit(&private_data->buffers[0]); + ArrowBufferInit(&private_data->buffers[1]); + private_data->buffer_data[0] = NULL; + private_data->buffer_data[1] = NULL; + private_data->buffer_data[2] = NULL; + + array->private_data = private_data; + array->buffers = (const void**)(&private_data->buffer_data); + + int result = ArrowArraySetStorageType(array, storage_type); + if (result != NANOARROW_OK) { + array->release(array); + return result; + } + + ArrowLayoutInit(&private_data->layout, storage_type); + // We can only know this not to be true when initializing based on a schema + // so assume this to be true. + private_data->union_type_id_is_child_index = 1; + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, + struct ArrowArrayView* array_view, + struct ArrowError* error) { + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowArrayInitFromType(array, array_view->storage_type), error); + int result; + + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + private_data->layout = array_view->layout; + + if (array_view->n_children > 0) { + result = ArrowArrayAllocateChildren(array, array_view->n_children); + if (result != NANOARROW_OK) { + array->release(array); + return result; + } + + for (int64_t i = 0; i < array_view->n_children; i++) { + result = + ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error); + if (result != NANOARROW_OK) { + array->release(array); + return result; + } + } + } + + if (array_view->dictionary != NULL) { + result = ArrowArrayAllocateDictionary(array); + if (result != 
NANOARROW_OK) { + array->release(array); + return result; + } + + result = + ArrowArrayInitFromArrayView(array->dictionary, array_view->dictionary, error); + if (result != NANOARROW_OK) { + array->release(array); + return result; + } + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, + struct ArrowSchema* schema, + struct ArrowError* error) { + struct ArrowArrayView array_view; + NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema(&array_view, schema, error)); + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromArrayView(array, &array_view, error)); + if (array_view.storage_type == NANOARROW_TYPE_DENSE_UNION || + array_view.storage_type == NANOARROW_TYPE_SPARSE_UNION) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + // We can still build arrays if this isn't true; however, the append + // functions won't work. Instead, we store this value and error only + // when StartAppending is called. + private_data->union_type_id_is_child_index = + _ArrowUnionTypeIdsWillEqualChildIndices(schema->format + 4, schema->n_children); + } + + ArrowArrayViewReset(&array_view); + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children) { + if (array->children != NULL) { + return EINVAL; + } + + if (n_children == 0) { + return NANOARROW_OK; + } + + array->children = + (struct ArrowArray**)ArrowMalloc(n_children * sizeof(struct ArrowArray*)); + if (array->children == NULL) { + return ENOMEM; + } + + memset(array->children, 0, n_children * sizeof(struct ArrowArray*)); + + for (int64_t i = 0; i < n_children; i++) { + array->children[i] = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); + if (array->children[i] == NULL) { + return ENOMEM; + } + array->children[i]->release = NULL; + } + + array->n_children = n_children; + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array) { + if 
(array->dictionary != NULL) { + return EINVAL; + } + + array->dictionary = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); + if (array->dictionary == NULL) { + return ENOMEM; + } + + array->dictionary->release = NULL; + return NANOARROW_OK; +} + +void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + ArrowBufferMove(&bitmap->buffer, &private_data->bitmap.buffer); + private_data->bitmap.size_bits = bitmap->size_bits; + bitmap->size_bits = 0; + private_data->buffer_data[0] = private_data->bitmap.buffer.data; + array->null_count = -1; +} + +ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i, + struct ArrowBuffer* buffer) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + switch (i) { + case 0: + ArrowBufferMove(buffer, &private_data->bitmap.buffer); + private_data->buffer_data[i] = private_data->bitmap.buffer.data; + break; + case 1: + case 2: + ArrowBufferMove(buffer, &private_data->buffers[i - 1]); + private_data->buffer_data[i] = private_data->buffers[i - 1].data; + break; + default: + return EINVAL; + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_view, + struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + ArrowArrayViewInitFromType(array_view, private_data->storage_type); + array_view->layout = private_data->layout; + array_view->array = array; + array_view->length = array->length; + array_view->offset = array->offset; + array_view->null_count = array->null_count; + + array_view->buffer_views[0].data.as_uint8 = private_data->bitmap.buffer.data; + array_view->buffer_views[0].size_bytes = private_data->bitmap.buffer.size_bytes; + array_view->buffer_views[1].data.as_uint8 = 
private_data->buffers[0].data; + array_view->buffer_views[1].size_bytes = private_data->buffers[0].size_bytes; + array_view->buffer_views[2].data.as_uint8 = private_data->buffers[1].data; + array_view->buffer_views[2].size_bytes = private_data->buffers[1].size_bytes; + + int result = ArrowArrayViewAllocateChildren(array_view, array->n_children); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + for (int64_t i = 0; i < array->n_children; i++) { + result = ArrowArrayViewInitFromArray(array_view->children[i], array->children[i]); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + + if (array->dictionary != NULL) { + result = ArrowArrayViewAllocateDictionary(array_view); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + result = ArrowArrayViewInitFromArray(array_view->dictionary, array->dictionary); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowArrayReserveInternal(struct ArrowArray* array, + struct ArrowArrayView* array_view) { + // Loop through buffers and reserve the extra space that we know about + for (int64_t i = 0; i < array->n_buffers; i++) { + // Don't reserve on a validity buffer that hasn't been allocated yet + if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY && + ArrowArrayBuffer(array, i)->data == NULL) { + continue; + } + + int64_t additional_size_bytes = + array_view->buffer_views[i].size_bytes - ArrowArrayBuffer(array, i)->size_bytes; + + if (additional_size_bytes > 0) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferReserve(ArrowArrayBuffer(array, i), additional_size_bytes)); + } + } + + // Recursively reserve children + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayReserveInternal(array->children[i], array_view->children[i])); + } + + return NANOARROW_OK; +} + 
// Reserves space for additional_size_elements more elements in every buffer
// of array (recursively), using a temporary ArrowArrayView to compute the
// theoretical buffer sizes at the new length.
ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array,
                                 int64_t additional_size_elements) {
  struct ArrowArrayView array_view;
  NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromArray(&array_view, array));

  // Calculate theoretical buffer sizes (recursively)
  ArrowArrayViewSetLength(&array_view, array->length + additional_size_elements);

  // Walk the structure (recursively)
  int result = ArrowArrayReserveInternal(array, &array_view);
  ArrowArrayViewReset(&array_view);
  if (result != NANOARROW_OK) {
    return result;
  }

  return NANOARROW_OK;
}

// Ensures buffers are in an exportable state (recursively).
static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) {
  struct ArrowArrayPrivateData* private_data =
      (struct ArrowArrayPrivateData*)array->private_data;

  // The only buffer finalizing this currently does is make sure the data
  // buffer for (Large)String|Binary is never NULL
  switch (private_data->storage_type) {
    case NANOARROW_TYPE_BINARY:
    case NANOARROW_TYPE_STRING:
    case NANOARROW_TYPE_LARGE_BINARY:
    case NANOARROW_TYPE_LARGE_STRING:
      if (ArrowArrayBuffer(array, 2)->data == NULL) {
        ArrowBufferAppendUInt8(ArrowArrayBuffer(array, 2), 0);
      }
      break;
    default:
      break;
  }

  for (int64_t i = 0; i < array->n_children; i++) {
    NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i]));
  }

  if (array->dictionary != NULL) {
    NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary));
  }

  return NANOARROW_OK;
}

// Copies the current buffer data pointers into the exported buffer_data
// array (recursively); needed because reallocation may have moved them.
static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) {
  struct ArrowArrayPrivateData* private_data =
      (struct ArrowArrayPrivateData*)array->private_data;

  for (int64_t i = 0; i < 3; i++) {
    private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data;
  }

  for (int64_t i = 0; i < array->n_children; i++) {
    ArrowArrayFlushInternalPointers(array->children[i]);
  }

  if (array->dictionary != NULL) {
    ArrowArrayFlushInternalPointers(array->dictionary);
  }
}

// Finalizes an array under construction: fixes up buffers, flushes the
// exported pointers, and validates at the requested level.
ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array,
                                        enum ArrowValidationLevel validation_level,
                                        struct ArrowError* error) {
  // Even if the data buffer is size zero, the pointer value needed to be non-null
  // in some implementations (at least one version of Arrow C++ at the time this
  // was added). Only do this fix if we can assume CPU data access.
  if (validation_level >= NANOARROW_VALIDATION_LEVEL_DEFAULT) {
    NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinalizeBuffers(array), error);
  }

  // Make sure the value we get with array->buffers[i] is set to the actual
  // pointer (which may have changed from the original due to reallocation)
  ArrowArrayFlushInternalPointers(array);

  if (validation_level == NANOARROW_VALIDATION_LEVEL_NONE) {
    return NANOARROW_OK;
  }

  // For validation, initialize an ArrowArrayView with our known buffer sizes
  struct ArrowArrayView array_view;
  NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayViewInitFromArray(&array_view, array),
                                     error);
  int result = ArrowArrayViewValidate(&array_view, validation_level, error);
  ArrowArrayViewReset(&array_view);
  return result;
}

// Convenience wrapper: finish building with default validation.
ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array,
                                               struct ArrowError* error) {
  return ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_DEFAULT, error);
}

// Zero-initializes an ArrowArrayView for storage_type and its buffer layout.
void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view,
                                enum ArrowType storage_type) {
  memset(array_view, 0, sizeof(struct ArrowArrayView));
  array_view->storage_type = storage_type;
  ArrowLayoutInit(&array_view->layout, storage_type);
}

// Allocates n_children uninitialized child views. On mid-allocation ENOMEM,
// already-allocated children are owned by array_view (freed by Reset).
ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view,
                                              int64_t n_children) {
  if (array_view->children != NULL) {
    return EINVAL;
  }

  array_view->children =
      (struct ArrowArrayView**)ArrowMalloc(n_children * sizeof(struct ArrowArrayView*));
  if (array_view->children == NULL) {
    return ENOMEM;
  }

  // NULL all slots first so a partial failure can be cleaned up safely
  for (int64_t i = 0; i < n_children; i++) {
    array_view->children[i] = NULL;
  }

  array_view->n_children = n_children;

  for (int64_t i = 0; i < n_children; i++) {
    array_view->children[i] =
        (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView));
    if (array_view->children[i] == NULL) {
      return ENOMEM;
    }
    ArrowArrayViewInitFromType(array_view->children[i], NANOARROW_TYPE_UNINITIALIZED);
  }

  return NANOARROW_OK;
}

// Allocates an uninitialized dictionary view member on array_view.
ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view) {
  if (array_view->dictionary != NULL) {
    return EINVAL;
  }

  array_view->dictionary =
      (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView));
  if (array_view->dictionary == NULL) {
    return ENOMEM;
  }

  ArrowArrayViewInitFromType(array_view->dictionary, NANOARROW_TYPE_UNINITIALIZED);
  return NANOARROW_OK;
}

// Builds an ArrowArrayView (recursively, including dictionary) from a
// schema. For unions, also builds the 256-entry type-id <-> child-index map.
ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view,
                                            struct ArrowSchema* schema,
                                            struct ArrowError* error) {
  struct ArrowSchemaView schema_view;
  int result = ArrowSchemaViewInit(&schema_view, schema, error);
  if (result != NANOARROW_OK) {
    return result;
  }

  ArrowArrayViewInitFromType(array_view, schema_view.storage_type);
  array_view->layout = schema_view.layout;

  result = ArrowArrayViewAllocateChildren(array_view, schema->n_children);
  if (result != NANOARROW_OK) {
    ArrowErrorSet(error, "ArrowArrayViewAllocateChildren() failed");
    ArrowArrayViewReset(array_view);
    return result;
  }

  for (int64_t i = 0; i < schema->n_children; i++) {
    result =
        ArrowArrayViewInitFromSchema(array_view->children[i], schema->children[i], error);
    if (result != NANOARROW_OK) {
      ArrowArrayViewReset(array_view);
      return result;
    }
  }

  if (schema->dictionary != NULL) {
    result = ArrowArrayViewAllocateDictionary(array_view);
    if (result != NANOARROW_OK) {
      ArrowArrayViewReset(array_view);
      return result;
    }

    result =
        ArrowArrayViewInitFromSchema(array_view->dictionary, schema->dictionary, error);
    if (result != NANOARROW_OK) {
      ArrowArrayViewReset(array_view);
      return result;
    }
  }

  if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION ||
      array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) {
    array_view->union_type_id_map = (int8_t*)ArrowMalloc(256 * sizeof(int8_t));
    if (array_view->union_type_id_map == NULL) {
      return ENOMEM;
    }

    // Entries [0, 128) map type id -> child index; [128, 256) map child
    // index -> type id; -1 marks "not present".
    memset(array_view->union_type_id_map, -1, 256);
    int8_t n_type_ids = _ArrowParseUnionTypeIds(schema_view.union_type_ids,
                                                array_view->union_type_id_map + 128);
    for (int8_t child_index = 0; child_index < n_type_ids; child_index++) {
      int8_t type_id = array_view->union_type_id_map[128 + child_index];
      array_view->union_type_id_map[type_id] = child_index;
    }
  }

  return NANOARROW_OK;
}

// Frees all owned members (children, dictionary, union map) and returns the
// view to the uninitialized state.
void ArrowArrayViewReset(struct ArrowArrayView* array_view) {
  if (array_view->children != NULL) {
    for (int64_t i = 0; i < array_view->n_children; i++) {
      if (array_view->children[i] != NULL) {
        ArrowArrayViewReset(array_view->children[i]);
        ArrowFree(array_view->children[i]);
      }
    }

    ArrowFree(array_view->children);
  }

  if (array_view->dictionary != NULL) {
    ArrowArrayViewReset(array_view->dictionary);
    ArrowFree(array_view->dictionary);
  }

  if (array_view->union_type_id_map != NULL) {
    ArrowFree(array_view->union_type_id_map);
  }

  ArrowArrayViewInitFromType(array_view, NANOARROW_TYPE_UNINITIALIZED);
}

// Sets the view's buffer sizes to the theoretical sizes for `length`
// elements (recursively for struct/sparse-union/fixed-size-list children).
void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) {
  for (int i = 0; i < 3; i++) {
    int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8;

    switch (array_view->layout.buffer_type[i]) {
      case NANOARROW_BUFFER_TYPE_VALIDITY:
        array_view->buffer_views[i].size_bytes = _ArrowBytesForBits(length);
        continue;
      case NANOARROW_BUFFER_TYPE_DATA_OFFSET:
        // Probably don't want/need to rely on the producer to have allocated an
        // offsets buffer of length 1 for a zero-size array
        array_view->buffer_views[i].size_bytes =
            (length != 0) * element_size_bytes * (length + 1);
        continue;
      case NANOARROW_BUFFER_TYPE_DATA:
        array_view->buffer_views[i].size_bytes =
            _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * length) /
            8;
        continue;
      case NANOARROW_BUFFER_TYPE_TYPE_ID:
      case NANOARROW_BUFFER_TYPE_UNION_OFFSET:
        array_view->buffer_views[i].size_bytes = element_size_bytes * length;
        continue;
      case NANOARROW_BUFFER_TYPE_NONE:
        array_view->buffer_views[i].size_bytes = 0;
        continue;
    }
  }

  switch (array_view->storage_type) {
    case NANOARROW_TYPE_STRUCT:
    case NANOARROW_TYPE_SPARSE_UNION:
      for (int64_t i = 0; i < array_view->n_children; i++) {
        ArrowArrayViewSetLength(array_view->children[i], length);
      }
      break;
    case NANOARROW_TYPE_FIXED_SIZE_LIST:
      if (array_view->n_children >= 1) {
        ArrowArrayViewSetLength(array_view->children[0],
                                length * array_view->layout.child_size_elements);
      }
      // intentional fall-through: nothing further to do
    default:
      break;
  }
}

// This version recursively extracts information from the array and stores it
// in the array view, performing any checks that require the original array.
// Copies lengths/offsets/buffer pointers from array into array_view
// (recursively), checking structural invariants that need the original
// array (buffer count, child count, dictionary presence). Buffer sizes of
// non-NULL buffers are set to -1 ("unknown") for later validation to fill in.
static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view,
                                          struct ArrowArray* array,
                                          struct ArrowError* error) {
  // Check length and offset
  if (array->offset < 0) {
    ArrowErrorSet(error, "Expected array offset >= 0 but found array offset of %ld",
                  (long)array->offset);
    return EINVAL;
  }

  if (array->length < 0) {
    ArrowErrorSet(error, "Expected array length >= 0 but found array length of %ld",
                  (long)array->length);
    return EINVAL;
  }

  array_view->array = array;
  array_view->offset = array->offset;
  array_view->length = array->length;
  array_view->null_count = array->null_count;

  int64_t buffers_required = 0;
  for (int i = 0; i < 3; i++) {
    if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) {
      break;
    }

    buffers_required++;

    // Set buffer pointer
    array_view->buffer_views[i].data.data = array->buffers[i];

    // If non-null, set buffer size to unknown.
    if (array->buffers[i] == NULL) {
      array_view->buffer_views[i].size_bytes = 0;
    } else {
      array_view->buffer_views[i].size_bytes = -1;
    }
  }

  // Check the number of buffers
  if (buffers_required != array->n_buffers) {
    ArrowErrorSet(error, "Expected array with %d buffer(s) but found %d buffer(s)",
                  (int)buffers_required, (int)array->n_buffers);
    return EINVAL;
  }

  // Check number of children
  if (array_view->n_children != array->n_children) {
    ArrowErrorSet(error, "Expected %ld children but found %ld children",
                  (long)array_view->n_children, (long)array->n_children);
    return EINVAL;
  }

  // Recurse for children
  for (int64_t i = 0; i < array_view->n_children; i++) {
    NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view->children[i],
                                                           array->children[i], error));
  }

  // Check dictionary
  if (array->dictionary == NULL && array_view->dictionary != NULL) {
    ArrowErrorSet(error, "Expected dictionary but found NULL");
    return EINVAL;
  }

  if (array->dictionary != NULL && array_view->dictionary == NULL) {
    ArrowErrorSet(error, "Expected NULL dictionary but found dictionary member");
    return EINVAL;
  }

  if (array->dictionary != NULL) {
    NANOARROW_RETURN_NOT_OK(
        ArrowArrayViewSetArrayInternal(array_view->dictionary, array->dictionary, error));
  }

  return NANOARROW_OK;
}

// Validation that never dereferences buffer contents: checks (or assigns,
// when marked -1/"unknown") minimum buffer sizes, and validates child counts
// and child lengths where the type determines them.
static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view,
                                         struct ArrowError* error) {
  // Calculate buffer sizes that do not require buffer access. If marked as
  // unknown, assign the buffer size; otherwise, validate it.
  int64_t offset_plus_length = array_view->offset + array_view->length;

  // Only loop over the first two buffers because the size of the third buffer
  // is always data dependent for all current Arrow types.
  for (int i = 0; i < 2; i++) {
    int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8;
    // Initialize with a value that will cause an error if accidentally used uninitialized
    int64_t min_buffer_size_bytes = array_view->buffer_views[i].size_bytes + 1;

    switch (array_view->layout.buffer_type[i]) {
      case NANOARROW_BUFFER_TYPE_VALIDITY:
        if (array_view->null_count == 0 && array_view->buffer_views[i].size_bytes == 0) {
          continue;
        }

        min_buffer_size_bytes = _ArrowBytesForBits(offset_plus_length);
        break;
      case NANOARROW_BUFFER_TYPE_DATA_OFFSET:
        // Probably don't want/need to rely on the producer to have allocated an
        // offsets buffer of length 1 for a zero-size array
        min_buffer_size_bytes =
            (offset_plus_length != 0) * element_size_bytes * (offset_plus_length + 1);
        break;
      case NANOARROW_BUFFER_TYPE_DATA:
        min_buffer_size_bytes =
            _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] *
                                       offset_plus_length) /
            8;
        break;
      case NANOARROW_BUFFER_TYPE_TYPE_ID:
      case NANOARROW_BUFFER_TYPE_UNION_OFFSET:
        min_buffer_size_bytes = element_size_bytes * offset_plus_length;
        break;
      case NANOARROW_BUFFER_TYPE_NONE:
        continue;
    }

    // Assign or validate buffer size
    if (array_view->buffer_views[i].size_bytes == -1) {
      array_view->buffer_views[i].size_bytes = min_buffer_size_bytes;
    } else if (array_view->buffer_views[i].size_bytes < min_buffer_size_bytes) {
      ArrowErrorSet(error,
                    "Expected %s array buffer %d to have size >= %ld bytes but found "
                    "buffer with %ld bytes",
                    ArrowTypeString(array_view->storage_type), (int)i,
                    (long)min_buffer_size_bytes,
                    (long)array_view->buffer_views[i].size_bytes);
      return EINVAL;
    }
  }

  // For list, fixed-size list and map views, we can validate the number of children
  switch (array_view->storage_type) {
    case NANOARROW_TYPE_LIST:
    case NANOARROW_TYPE_LARGE_LIST:
    case NANOARROW_TYPE_FIXED_SIZE_LIST:
    case NANOARROW_TYPE_MAP:
      if (array_view->n_children != 1) {
        ArrowErrorSet(error, "Expected 1 child of %s array but found %ld child arrays",
                      ArrowTypeString(array_view->storage_type),
                      (long)array_view->n_children);
        return EINVAL;
      }
    default:
      break;
  }

  // For struct, the sparse union, and the fixed-size list views, we can validate child
  // lengths.
  int64_t child_min_length;
  switch (array_view->storage_type) {
    case NANOARROW_TYPE_SPARSE_UNION:
    case NANOARROW_TYPE_STRUCT:
      child_min_length = (array_view->offset + array_view->length);
      for (int64_t i = 0; i < array_view->n_children; i++) {
        if (array_view->children[i]->length < child_min_length) {
          ArrowErrorSet(
              error,
              "Expected struct child %d to have length >= %ld but found child with "
              "length %ld",
              (int)(i + 1), (long)(child_min_length),
              (long)array_view->children[i]->length);
          return EINVAL;
        }
      }
      break;

    case NANOARROW_TYPE_FIXED_SIZE_LIST:
      child_min_length = (array_view->offset + array_view->length) *
                         array_view->layout.child_size_elements;
      if (array_view->children[0]->length < child_min_length) {
        ArrowErrorSet(error,
                      "Expected child of fixed_size_list array to have length >= %ld but "
                      "found array with length %ld",
                      (long)child_min_length, (long)array_view->children[0]->length);
        return EINVAL;
      }
      break;
    default:
      break;
  }

  // Recurse for children
  for (int64_t i = 0; i < array_view->n_children; i++) {
    NANOARROW_RETURN_NOT_OK(
        ArrowArrayViewValidateMinimal(array_view->children[i], error));
  }

  // Recurse for dictionary
  if (array_view->dictionary != NULL) {
    NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view->dictionary, error));
  }

  return NANOARROW_OK;
}

// Validation that additionally reads the offsets buffer: checks the first
// offset is >= 0 and that the data buffer / child length covers the last
// offset, assigning unknown (-1) data-buffer sizes along the way.
static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
                                         struct ArrowError* error) {
  // Perform minimal validation. This will validate or assign
  // buffer sizes as long as buffer access is not required.
  NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error));

  // Calculate buffer sizes or child lengths that require accessing the offsets
  // buffer. Where appropriate, validate that the first offset is >= 0.
  // If a buffer size is marked as unknown, assign it; otherwise, validate it.
  int64_t offset_plus_length = array_view->offset + array_view->length;

  int64_t first_offset;
  int64_t last_offset;
  switch (array_view->storage_type) {
    case NANOARROW_TYPE_STRING:
    case NANOARROW_TYPE_BINARY:
      if (array_view->buffer_views[1].size_bytes != 0) {
        first_offset = array_view->buffer_views[1].data.as_int32[0];
        if (first_offset < 0) {
          ArrowErrorSet(error, "Expected first offset >= 0 but found %ld",
                        (long)first_offset);
          return EINVAL;
        }

        last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length];

        // If the data buffer size is unknown, assign it; otherwise, check it
        if (array_view->buffer_views[2].size_bytes == -1) {
          array_view->buffer_views[2].size_bytes = last_offset;
        } else if (array_view->buffer_views[2].size_bytes < last_offset) {
          ArrowErrorSet(error,
                        "Expected %s array buffer 2 to have size >= %ld bytes but found "
                        "buffer with %ld bytes",
                        ArrowTypeString(array_view->storage_type), (long)last_offset,
                        (long)array_view->buffer_views[2].size_bytes);
          return EINVAL;
        }
      }
      break;

    case NANOARROW_TYPE_LARGE_STRING:
    case NANOARROW_TYPE_LARGE_BINARY:
      if (array_view->buffer_views[1].size_bytes != 0) {
        first_offset = array_view->buffer_views[1].data.as_int64[0];
        if (first_offset < 0) {
          ArrowErrorSet(error, "Expected first offset >= 0 but found %ld",
                        (long)first_offset);
          return EINVAL;
        }

        last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length];

        // If the data buffer size is unknown, assign it; otherwise, check it
        if (array_view->buffer_views[2].size_bytes == -1) {
          array_view->buffer_views[2].size_bytes = last_offset;
        } else if (array_view->buffer_views[2].size_bytes < last_offset) {
          ArrowErrorSet(error,
                        "Expected %s array buffer 2 to have size >= %ld bytes but found "
                        "buffer with %ld bytes",
                        ArrowTypeString(array_view->storage_type), (long)last_offset,
                        (long)array_view->buffer_views[2].size_bytes);
          return EINVAL;
        }
      }
      break;

    case NANOARROW_TYPE_STRUCT:
      for (int64_t i = 0; i < array_view->n_children; i++) {
        if (array_view->children[i]->length < offset_plus_length) {
          ArrowErrorSet(
              error,
              "Expected struct child %d to have length >= %ld but found child with "
              "length %ld",
              (int)(i + 1), (long)offset_plus_length,
              (long)array_view->children[i]->length);
          return EINVAL;
        }
      }
      break;

    case NANOARROW_TYPE_LIST:
    case NANOARROW_TYPE_MAP:
      if (array_view->buffer_views[1].size_bytes != 0) {
        first_offset = array_view->buffer_views[1].data.as_int32[0];
        if (first_offset < 0) {
          ArrowErrorSet(error, "Expected first offset >= 0 but found %ld",
                        (long)first_offset);
          return EINVAL;
        }

        last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length];
        if (array_view->children[0]->length < last_offset) {
          ArrowErrorSet(
              error,
              "Expected child of %s array to have length >= %ld but found array with "
              "length %ld",
              ArrowTypeString(array_view->storage_type), (long)last_offset,
              (long)array_view->children[0]->length);
          return EINVAL;
        }
      }
      break;

    case NANOARROW_TYPE_LARGE_LIST:
      if (array_view->buffer_views[1].size_bytes != 0) {
        first_offset = array_view->buffer_views[1].data.as_int64[0];
        if (first_offset < 0) {
          ArrowErrorSet(error, "Expected first offset >= 0 but found %ld",
                        (long)first_offset);
          return EINVAL;
        }

        last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length];
        if (array_view->children[0]->length < last_offset) {
          ArrowErrorSet(
              error,
              "Expected child of large list array to have length >= %ld but found array "
              "with length %ld",
              (long)last_offset, (long)array_view->children[0]->length);
          return EINVAL;
        }
      }
      break;
    default:
      break;
  }

  // Recurse for children
  for (int64_t i = 0; i < array_view->n_children; i++) {
    NANOARROW_RETURN_NOT_OK(
        ArrowArrayViewValidateDefault(array_view->children[i], error));
  }

  // Recurse for dictionary
  if (array_view->dictionary != NULL) {
    NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view->dictionary, error));
  }

  return NANOARROW_OK;
}

// Points array_view at array and runs default-level validation (which also
// fills in the buffer sizes marked unknown).
ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view,
                                      struct ArrowArray* array,
                                      struct ArrowError* error) {
  // Extract information from the array into the array view
  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error));

  // Run default validation. Because we've marked all non-NULL buffers as having unknown
  // size, validation will also update the buffer sizes as it goes.
  NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error));

  return NANOARROW_OK;
}

// Like ArrowArrayViewSetArray() but runs only minimal (no-buffer-access)
// validation.
ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view,
                                             struct ArrowArray* array,
                                             struct ArrowError* error) {
  // Extract information from the array into the array view
  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error));

  // Run minimal validation. Because we've marked all non-NULL buffers as having unknown
  // size, validation will also update the buffer sizes as it goes.
  NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error));

  return NANOARROW_OK;
}

// Asserts that a buffer of int32 offsets is non-decreasing.
static int ArrowAssertIncreasingInt32(struct ArrowBufferView view,
                                      struct ArrowError* error) {
  if (view.size_bytes <= (int64_t)sizeof(int32_t)) {
    return NANOARROW_OK;
  }

  for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int32_t); i++) {
    int32_t diff = view.data.as_int32[i] - view.data.as_int32[i - 1];
    if (diff < 0) {
      ArrowErrorSet(error, "[%ld] Expected element size >= 0 but found element size %ld",
                    (long)i, (long)diff);
      return EINVAL;
    }
  }

  return NANOARROW_OK;
}

// Asserts that a buffer of int64 offsets is non-decreasing.
static int ArrowAssertIncreasingInt64(struct ArrowBufferView view,
                                      struct ArrowError* error) {
  if (view.size_bytes <= (int64_t)sizeof(int64_t)) {
    return NANOARROW_OK;
  }

  for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int64_t); i++) {
    int64_t diff = view.data.as_int64[i] - view.data.as_int64[i - 1];
    if (diff < 0) {
      ArrowErrorSet(error, "[%ld] Expected element size >= 0 but found element size %ld",
                    (long)i, (long)diff);
      return EINVAL;
    }
  }

  return NANOARROW_OK;
}

// Asserts every int8 element lies within [min_value, max_value].
static int ArrowAssertRangeInt8(struct ArrowBufferView view, int8_t min_value,
                                int8_t max_value, struct ArrowError* error) {
  for (int64_t i = 0; i < view.size_bytes; i++) {
    if (view.data.as_int8[i] < min_value || view.data.as_int8[i] > max_value) {
      ArrowErrorSet(error,
                    "[%ld] Expected buffer value between %d and %d but found value %d",
                    (long)i, (int)min_value, (int)max_value, (int)view.data.as_int8[i]);
      return EINVAL;
    }
  }

  return NANOARROW_OK;
}

// Asserts every int8 element is one of the n_values entries in values.
static int ArrowAssertInt8In(struct ArrowBufferView view, const int8_t* values,
                             int64_t n_values, struct ArrowError* error) {
  for (int64_t i = 0; i < view.size_bytes; i++) {
    int item_found = 0;
    for (int64_t j = 0; j < n_values; j++) {
      if (view.data.as_int8[i] == values[j]) {
        item_found = 1;
        break;
      }
    }

    if (!item_found) {
      ArrowErrorSet(error, "[%ld] Unexpected buffer value %d", (long)i,
                    (int)view.data.as_int8[i]);
      return EINVAL;
    }
  }

  return NANOARROW_OK;
}

// Full validation: walks buffer contents to check offset monotonicity and
// union type-id/offset validity (recursively). Dictionary content validation
// is not implemented and returns ENOTSUP.
static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view,
                                      struct ArrowError* error) {
  for (int i = 0; i < 3; i++) {
    switch (array_view->layout.buffer_type[i]) {
      case NANOARROW_BUFFER_TYPE_DATA_OFFSET:
        if (array_view->layout.element_size_bits[i] == 32) {
          NANOARROW_RETURN_NOT_OK(
              ArrowAssertIncreasingInt32(array_view->buffer_views[i], error));
        } else {
          NANOARROW_RETURN_NOT_OK(
              ArrowAssertIncreasingInt64(array_view->buffer_views[i], error));
        }
        break;
      default:
        break;
    }
  }

  if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION ||
      array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION) {
    if (array_view->union_type_id_map == NULL) {
      // If the union_type_id map is NULL (e.g., when using ArrowArrayInitFromType() +
      // ArrowArrayAllocateChildren() + ArrowArrayFinishBuilding()), we don't have enough
      // information to validate this buffer.
      ArrowErrorSet(error,
                    "Insufficient information provided for validation of union array");
      return EINVAL;
    } else if (_ArrowParsedUnionTypeIdsWillEqualChildIndices(
                   array_view->union_type_id_map, array_view->n_children,
                   array_view->n_children)) {
      NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8(
          array_view->buffer_views[0], 0, (int8_t)(array_view->n_children - 1), error));
    } else {
      NANOARROW_RETURN_NOT_OK(ArrowAssertInt8In(array_view->buffer_views[0],
                                                array_view->union_type_id_map + 128,
                                                array_view->n_children, error));
    }
  }

  if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION &&
      array_view->union_type_id_map != NULL) {
    // Check that offsets refer to child elements that actually exist
    for (int64_t i = 0; i < array_view->length; i++) {
      int8_t child_id = ArrowArrayViewUnionChildIndex(array_view, i);
      int64_t offset = ArrowArrayViewUnionChildOffset(array_view, i);
      int64_t child_length = array_view->children[child_id]->length;
      if (offset < 0 || offset > child_length) {
        ArrowErrorSet(
            error,
            "[%ld] Expected union offset for child id %d to be between 0 and %ld but "
            "found offset value %ld",
            (long)i, (int)child_id, (long)child_length, offset);
        return EINVAL;
      }
    }
  }

  // Recurse for children
  for (int64_t i = 0; i < array_view->n_children; i++) {
    NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error));
  }

  // Dictionary validation not implemented
  if (array_view->dictionary != NULL) {
    ArrowErrorSet(error, "Validation for dictionary-encoded arrays is not implemented");
    return ENOTSUP;
  }

  return NANOARROW_OK;
}

// Dispatches to the validator matching validation_level; FULL also implies
// DEFAULT. Returns EINVAL for an unrecognized level.
ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view,
                                      enum ArrowValidationLevel validation_level,
                                      struct ArrowError* error) {
  switch (validation_level) {
    case NANOARROW_VALIDATION_LEVEL_NONE:
      return NANOARROW_OK;
    case NANOARROW_VALIDATION_LEVEL_MINIMAL:
      return ArrowArrayViewValidateMinimal(array_view, error);
    case NANOARROW_VALIDATION_LEVEL_DEFAULT:
      return ArrowArrayViewValidateDefault(array_view, error);
    case NANOARROW_VALIDATION_LEVEL_FULL:
      NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error));
      return ArrowArrayViewValidateFull(array_view, error);
  }

  ArrowErrorSet(error, "validation_level not recognized");
  return EINVAL;
}
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
+ +#include + +#include "nanoarrow.h" + +struct BasicArrayStreamPrivate { + struct ArrowSchema schema; + int64_t n_arrays; + struct ArrowArray* arrays; + int64_t arrays_i; +}; + +static int ArrowBasicArrayStreamGetSchema(struct ArrowArrayStream* array_stream, + struct ArrowSchema* schema) { + if (array_stream == NULL || array_stream->release == NULL) { + return EINVAL; + } + + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + return ArrowSchemaDeepCopy(&private_data->schema, schema); +} + +static int ArrowBasicArrayStreamGetNext(struct ArrowArrayStream* array_stream, + struct ArrowArray* array) { + if (array_stream == NULL || array_stream->release == NULL) { + return EINVAL; + } + + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + + if (private_data->arrays_i == private_data->n_arrays) { + array->release = NULL; + return NANOARROW_OK; + } + + ArrowArrayMove(&private_data->arrays[private_data->arrays_i++], array); + return NANOARROW_OK; +} + +static const char* ArrowBasicArrayStreamGetLastError( + struct ArrowArrayStream* array_stream) { + return NULL; +} + +static void ArrowBasicArrayStreamRelease(struct ArrowArrayStream* array_stream) { + if (array_stream == NULL || array_stream->release == NULL) { + return; + } + + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + + if (private_data->schema.release != NULL) { + private_data->schema.release(&private_data->schema); + } + + for (int64_t i = 0; i < private_data->n_arrays; i++) { + if (private_data->arrays[i].release != NULL) { + private_data->arrays[i].release(&private_data->arrays[i]); + } + } + + if (private_data->arrays != NULL) { + ArrowFree(private_data->arrays); + } + + ArrowFree(private_data); + array_stream->release = NULL; +} + +ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream, + struct 
ArrowSchema* schema, int64_t n_arrays) { + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)ArrowMalloc( + sizeof(struct BasicArrayStreamPrivate)); + if (private_data == NULL) { + return ENOMEM; + } + + ArrowSchemaMove(schema, &private_data->schema); + + private_data->n_arrays = n_arrays; + private_data->arrays = NULL; + private_data->arrays_i = 0; + + if (n_arrays > 0) { + private_data->arrays = + (struct ArrowArray*)ArrowMalloc(n_arrays * sizeof(struct ArrowArray)); + if (private_data->arrays == NULL) { + ArrowBasicArrayStreamRelease(array_stream); + return ENOMEM; + } + } + + for (int64_t i = 0; i < private_data->n_arrays; i++) { + private_data->arrays[i].release = NULL; + } + + array_stream->get_schema = &ArrowBasicArrayStreamGetSchema; + array_stream->get_next = &ArrowBasicArrayStreamGetNext; + array_stream->get_last_error = ArrowBasicArrayStreamGetLastError; + array_stream->release = ArrowBasicArrayStreamRelease; + array_stream->private_data = private_data; + return NANOARROW_OK; +} + +void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_t i, + struct ArrowArray* array) { + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + ArrowArrayMove(array, &private_data->arrays[i]); +} + +ArrowErrorCode ArrowBasicArrayStreamValidate(struct ArrowArrayStream* array_stream, + struct ArrowError* error) { + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + + struct ArrowArrayView array_view; + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewInitFromSchema(&array_view, &private_data->schema, error)); + + for (int64_t i = 0; i < private_data->n_arrays; i++) { + if (private_data->arrays[i].release != NULL) { + int result = ArrowArrayViewSetArray(&array_view, &private_data->arrays[i], error); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(&array_view); + return result; + } + } + } + + 
ArrowArrayViewReset(&array_view); + return NANOARROW_OK; +} From a39581bc65259fcd8b2c6c5158869562c90f1047 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 11 Aug 2023 17:59:14 -0400 Subject: [PATCH 007/126] more to_numpy adds --- pandas/tests/arrays/boolean/test_comparison.py | 2 +- pandas/tests/arrays/boolean/test_construction.py | 4 ++-- pandas/tests/arrays/boolean/test_function.py | 14 +++++++------- pandas/tests/arrays/boolean/test_logical.py | 4 ++-- pandas/tests/arrays/categorical/test_astype.py | 1 + pandas/tests/arrays/floating/test_arithmetic.py | 2 +- pandas/tests/arrays/floating/test_construction.py | 6 +++--- pandas/tests/arrays/integer/test_arithmetic.py | 2 +- pandas/tests/arrays/integer/test_construction.py | 4 ++-- pandas/tests/arrays/integer/test_dtypes.py | 2 +- pandas/tests/arrays/integer/test_function.py | 2 +- pandas/tests/arrays/masked_shared.py | 8 ++++---- pandas/tests/indexes/test_old_base.py | 6 ++++-- 13 files changed, 30 insertions(+), 27 deletions(-) diff --git a/pandas/tests/arrays/boolean/test_comparison.py b/pandas/tests/arrays/boolean/test_comparison.py index 2eeb9da574b1e..b6f04deca1a7b 100644 --- a/pandas/tests/arrays/boolean/test_comparison.py +++ b/pandas/tests/arrays/boolean/test_comparison.py @@ -46,7 +46,7 @@ def test_array(self, comparison_op): result = op(a, b) values = op(a._data, b._data) - mask = a._mask | b._mask + mask = a._mask.to_numpy() | b._mask.to_numpy() expected = BooleanArray(values, mask) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index d26eea19c06e9..cb24a9dd778a0 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -40,11 +40,11 @@ def test_boolean_array_constructor_copy(): result = BooleanArray(values, mask) assert result._data is values - assert result._mask is mask + # assert result._mask is mask result = 
BooleanArray(values, mask, copy=True) assert result._data is not values - assert result._mask is not mask + # assert result._mask is not mask def test_to_boolean_array(): diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index 2b3f3d3d16ac6..bf51035678db7 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py @@ -13,36 +13,36 @@ def test_ufuncs_binary(ufunc): a = pd.array([True, False, None], dtype="boolean") result = ufunc(a, a) expected = pd.array(ufunc(a._data, a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) s = pd.Series(a) result = ufunc(s, a) expected = pd.Series(ufunc(a._data, a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_series_equal(result, expected) # Boolean with numpy array arr = np.array([True, True, False]) result = ufunc(a, arr) expected = pd.array(ufunc(a._data, arr), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) result = ufunc(arr, a) expected = pd.array(ufunc(arr, a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) # BooleanArray with scalar result = ufunc(a, True) expected = pd.array(ufunc(a._data, True), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) result = ufunc(True, a) expected = pd.array(ufunc(True, a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) # not handled types @@ -56,7 +56,7 @@ def test_ufuncs_unary(ufunc): a = pd.array([True, False, None], dtype="boolean") result = ufunc(a) expected = 
pd.array(ufunc(a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) ser = pd.Series(a) diff --git a/pandas/tests/arrays/boolean/test_logical.py b/pandas/tests/arrays/boolean/test_logical.py index 66c117ea3fc66..4cdaf3a90b21d 100644 --- a/pandas/tests/arrays/boolean/test_logical.py +++ b/pandas/tests/arrays/boolean/test_logical.py @@ -238,8 +238,8 @@ def test_no_masked_assumptions(self, other, all_logical_operators): tm.assert_extension_array_equal(result, expected) if isinstance(other, BooleanArray): - other._data[other._mask] = True - a._data[a._mask] = False + other._data[other._mask.to_numpy()] = True + a._data[a._mask.to_numpy()] = False result = getattr(a, all_logical_operators)(other) expected = getattr(b, all_logical_operators)(other) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index d2f9f6dffab49..a7d5ecda3c644 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -146,6 +146,7 @@ def test_astype_object_timestamp_categories(self): expected = np.array([Timestamp("2014-01-01 00:00:00")], dtype="object") tm.assert_numpy_array_equal(result, expected) + @pytest.skip("not applicable with bitmask") def test_astype_category_readonly_mask_values(self): # GH#53658 arr = array([0, 1, 2], dtype="Int64") diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 056c22d8c1131..8ee291d6fd6f5 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -67,7 +67,7 @@ def test_pow_scalar(dtype): # TODO np.nan should be converted to pd.NA / missing before operation? 
expected = FloatingArray( np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype), - mask=a._mask, + mask=a._mask.to_numpy(), ) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 4007ee6b415c9..699153b2c0639 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -23,7 +23,7 @@ def test_floating_array_constructor(): expected = pd.array([1, 2, 3, np.nan], dtype="Float64") tm.assert_extension_array_equal(result, expected) tm.assert_numpy_array_equal(result._data, values) - tm.assert_numpy_array_equal(result._mask, mask) + tm.assert_numpy_array_equal(result._mask.to_numpy(), mask) msg = r".* should be .* numpy array. Use the 'pd.array' function instead" with pytest.raises(TypeError, match=msg): @@ -62,11 +62,11 @@ def test_floating_array_constructor_copy(): result = FloatingArray(values, mask) assert result._data is values - assert result._mask is mask + # assert result._mask is mask result = FloatingArray(values, mask, copy=True) assert result._data is not values - assert result._mask is not mask + # assert result._mask is not mask def test_to_array(): diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index ce6c245cd0f37..286d884994c44 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -248,7 +248,7 @@ def test_arith_coerce_scalar(data, all_arithmetic_operators): # rmod results in NaN that wasn't NA in original nullable Series -> unmask it if all_arithmetic_operators == "__rmod__": mask = (s == 0).fillna(False).to_numpy(bool) - expected.array._mask[mask] = False + expected.array._mask[mask.to_numpy()] = False tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_construction.py 
b/pandas/tests/arrays/integer/test_construction.py index 9ecfc51cb2208..f6ef5db17044b 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -100,11 +100,11 @@ def test_integer_array_constructor_copy(): result = IntegerArray(values, mask) assert result._data is values - assert result._mask is mask + # assert result._mask is mask result = IntegerArray(values, mask, copy=True) assert result._data is not values - assert result._mask is not mask + # assert result._mask is not mask @pytest.mark.parametrize( diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index f50b4cfd0b520..312fa90844847 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -163,7 +163,7 @@ def test_astype_copy(): result = arr.astype("Int64", copy=False) assert result is arr assert np.shares_memory(result._data, arr._data) - assert np.shares_memory(result._mask, arr._mask) + # assert np.shares_memory(result._mask, arr._mask) result[0] = 10 assert arr[0] == 10 result[0] = pd.NA diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index d48b636a98feb..40c9dcc697f46 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -26,7 +26,7 @@ def test_ufuncs_single_float(ufunc): a = pd.array([1, 2, -3, np.nan]) with np.errstate(invalid="ignore"): result = ufunc(a) - expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask) + expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask.to_numpy()) tm.assert_extension_array_equal(result, expected) s = pd.Series(a) diff --git a/pandas/tests/arrays/masked_shared.py b/pandas/tests/arrays/masked_shared.py index 3e74402263cf9..7adaf5f0a5859 100644 --- a/pandas/tests/arrays/masked_shared.py +++ b/pandas/tests/arrays/masked_shared.py @@ -16,7 +16,7 @@ def _compare_other(self, data, op, 
other): expected = pd.Series(op(data._data, other), dtype="boolean") # fill the nan locations - expected[data._mask] = pd.NA + expected[data._mask.to_numpy()] = pd.NA tm.assert_series_equal(result, expected) @@ -28,7 +28,7 @@ def _compare_other(self, data, op, other): expected = op(pd.Series(data._data), other).astype("boolean") # fill the nan locations - expected[data._mask] = pd.NA + expected[data._mask.to_numpy()] = pd.NA tm.assert_series_equal(result, expected) @@ -43,7 +43,7 @@ def test_scalar(self, other, comparison_op, dtype): expected = pd.array([None, None, None], dtype="boolean") else: values = op(left._data, other) - expected = pd.arrays.BooleanArray(values, left._mask, copy=True) + expected = pd.arrays.BooleanArray(values, left._mask.to_numpy(), copy=True) tm.assert_extension_array_equal(result, expected) # ensure we haven't mutated anything inplace @@ -74,7 +74,7 @@ def test_array(self, comparison_op, dtype): result = op(left, right) values = op(left._data, right._data) - mask = left._mask | right._mask + mask = left._mask.to_numpy() | right._mask.to_numpy() expected = pd.arrays.BooleanArray(values, mask) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 3b627f2fae845..eb7b2f585b426 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -277,9 +277,11 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._data, result._values._data, check_same="same" ) - assert np.shares_memory(index._values._mask, result._values._mask) + # assert np.shares_memory(index._values._mask, result._values._mask) tm.assert_numpy_array_equal( - index._values._mask, result._values._mask, check_same="same" + index._values._mask.to_numpy(), + result._values._mask.to_numpy(), + check_same="same", ) elif index.dtype == "string[python]": assert np.shares_memory(index._values._ndarray, result._values._ndarray) 
From cb1b2740c79f81bf0c77f9ba282ce780275367ca Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 12 Aug 2023 08:06:04 -0400 Subject: [PATCH 008/126] Revert "more to_numpy adds" This reverts commit a39581bc65259fcd8b2c6c5158869562c90f1047. --- pandas/tests/arrays/boolean/test_comparison.py | 2 +- pandas/tests/arrays/boolean/test_construction.py | 4 ++-- pandas/tests/arrays/boolean/test_function.py | 14 +++++++------- pandas/tests/arrays/boolean/test_logical.py | 4 ++-- pandas/tests/arrays/categorical/test_astype.py | 1 - pandas/tests/arrays/floating/test_arithmetic.py | 2 +- pandas/tests/arrays/floating/test_construction.py | 6 +++--- pandas/tests/arrays/integer/test_arithmetic.py | 2 +- pandas/tests/arrays/integer/test_construction.py | 4 ++-- pandas/tests/arrays/integer/test_dtypes.py | 2 +- pandas/tests/arrays/integer/test_function.py | 2 +- pandas/tests/arrays/masked_shared.py | 8 ++++---- pandas/tests/indexes/test_old_base.py | 6 ++---- 13 files changed, 27 insertions(+), 30 deletions(-) diff --git a/pandas/tests/arrays/boolean/test_comparison.py b/pandas/tests/arrays/boolean/test_comparison.py index b6f04deca1a7b..2eeb9da574b1e 100644 --- a/pandas/tests/arrays/boolean/test_comparison.py +++ b/pandas/tests/arrays/boolean/test_comparison.py @@ -46,7 +46,7 @@ def test_array(self, comparison_op): result = op(a, b) values = op(a._data, b._data) - mask = a._mask.to_numpy() | b._mask.to_numpy() + mask = a._mask | b._mask expected = BooleanArray(values, mask) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index cb24a9dd778a0..d26eea19c06e9 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -40,11 +40,11 @@ def test_boolean_array_constructor_copy(): result = BooleanArray(values, mask) assert result._data is values - # assert result._mask is mask + assert result._mask is mask result = 
BooleanArray(values, mask, copy=True) assert result._data is not values - # assert result._mask is not mask + assert result._mask is not mask def test_to_boolean_array(): diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index bf51035678db7..2b3f3d3d16ac6 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py @@ -13,36 +13,36 @@ def test_ufuncs_binary(ufunc): a = pd.array([True, False, None], dtype="boolean") result = ufunc(a, a) expected = pd.array(ufunc(a._data, a._data), dtype="boolean") - expected[a._mask.to_numpy()] = np.nan + expected[a._mask] = np.nan tm.assert_extension_array_equal(result, expected) s = pd.Series(a) result = ufunc(s, a) expected = pd.Series(ufunc(a._data, a._data), dtype="boolean") - expected[a._mask.to_numpy()] = np.nan + expected[a._mask] = np.nan tm.assert_series_equal(result, expected) # Boolean with numpy array arr = np.array([True, True, False]) result = ufunc(a, arr) expected = pd.array(ufunc(a._data, arr), dtype="boolean") - expected[a._mask.to_numpy()] = np.nan + expected[a._mask] = np.nan tm.assert_extension_array_equal(result, expected) result = ufunc(arr, a) expected = pd.array(ufunc(arr, a._data), dtype="boolean") - expected[a._mask.to_numpy()] = np.nan + expected[a._mask] = np.nan tm.assert_extension_array_equal(result, expected) # BooleanArray with scalar result = ufunc(a, True) expected = pd.array(ufunc(a._data, True), dtype="boolean") - expected[a._mask.to_numpy()] = np.nan + expected[a._mask] = np.nan tm.assert_extension_array_equal(result, expected) result = ufunc(True, a) expected = pd.array(ufunc(True, a._data), dtype="boolean") - expected[a._mask.to_numpy()] = np.nan + expected[a._mask] = np.nan tm.assert_extension_array_equal(result, expected) # not handled types @@ -56,7 +56,7 @@ def test_ufuncs_unary(ufunc): a = pd.array([True, False, None], dtype="boolean") result = ufunc(a) expected = 
pd.array(ufunc(a._data), dtype="boolean") - expected[a._mask.to_numpy()] = np.nan + expected[a._mask] = np.nan tm.assert_extension_array_equal(result, expected) ser = pd.Series(a) diff --git a/pandas/tests/arrays/boolean/test_logical.py b/pandas/tests/arrays/boolean/test_logical.py index 4cdaf3a90b21d..66c117ea3fc66 100644 --- a/pandas/tests/arrays/boolean/test_logical.py +++ b/pandas/tests/arrays/boolean/test_logical.py @@ -238,8 +238,8 @@ def test_no_masked_assumptions(self, other, all_logical_operators): tm.assert_extension_array_equal(result, expected) if isinstance(other, BooleanArray): - other._data[other._mask.to_numpy()] = True - a._data[a._mask.to_numpy()] = False + other._data[other._mask] = True + a._data[a._mask] = False result = getattr(a, all_logical_operators)(other) expected = getattr(b, all_logical_operators)(other) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index a7d5ecda3c644..d2f9f6dffab49 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -146,7 +146,6 @@ def test_astype_object_timestamp_categories(self): expected = np.array([Timestamp("2014-01-01 00:00:00")], dtype="object") tm.assert_numpy_array_equal(result, expected) - @pytest.skip("not applicable with bitmask") def test_astype_category_readonly_mask_values(self): # GH#53658 arr = array([0, 1, 2], dtype="Int64") diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 8ee291d6fd6f5..056c22d8c1131 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -67,7 +67,7 @@ def test_pow_scalar(dtype): # TODO np.nan should be converted to pd.NA / missing before operation? 
expected = FloatingArray( np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype), - mask=a._mask.to_numpy(), + mask=a._mask, ) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 699153b2c0639..4007ee6b415c9 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -23,7 +23,7 @@ def test_floating_array_constructor(): expected = pd.array([1, 2, 3, np.nan], dtype="Float64") tm.assert_extension_array_equal(result, expected) tm.assert_numpy_array_equal(result._data, values) - tm.assert_numpy_array_equal(result._mask.to_numpy(), mask) + tm.assert_numpy_array_equal(result._mask, mask) msg = r".* should be .* numpy array. Use the 'pd.array' function instead" with pytest.raises(TypeError, match=msg): @@ -62,11 +62,11 @@ def test_floating_array_constructor_copy(): result = FloatingArray(values, mask) assert result._data is values - # assert result._mask is mask + assert result._mask is mask result = FloatingArray(values, mask, copy=True) assert result._data is not values - # assert result._mask is not mask + assert result._mask is not mask def test_to_array(): diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index 286d884994c44..ce6c245cd0f37 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -248,7 +248,7 @@ def test_arith_coerce_scalar(data, all_arithmetic_operators): # rmod results in NaN that wasn't NA in original nullable Series -> unmask it if all_arithmetic_operators == "__rmod__": mask = (s == 0).fillna(False).to_numpy(bool) - expected.array._mask[mask.to_numpy()] = False + expected.array._mask[mask] = False tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_construction.py 
b/pandas/tests/arrays/integer/test_construction.py index f6ef5db17044b..9ecfc51cb2208 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -100,11 +100,11 @@ def test_integer_array_constructor_copy(): result = IntegerArray(values, mask) assert result._data is values - # assert result._mask is mask + assert result._mask is mask result = IntegerArray(values, mask, copy=True) assert result._data is not values - # assert result._mask is not mask + assert result._mask is not mask @pytest.mark.parametrize( diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index 312fa90844847..f50b4cfd0b520 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -163,7 +163,7 @@ def test_astype_copy(): result = arr.astype("Int64", copy=False) assert result is arr assert np.shares_memory(result._data, arr._data) - # assert np.shares_memory(result._mask, arr._mask) + assert np.shares_memory(result._mask, arr._mask) result[0] = 10 assert arr[0] == 10 result[0] = pd.NA diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 40c9dcc697f46..d48b636a98feb 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -26,7 +26,7 @@ def test_ufuncs_single_float(ufunc): a = pd.array([1, 2, -3, np.nan]) with np.errstate(invalid="ignore"): result = ufunc(a) - expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask.to_numpy()) + expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask) tm.assert_extension_array_equal(result, expected) s = pd.Series(a) diff --git a/pandas/tests/arrays/masked_shared.py b/pandas/tests/arrays/masked_shared.py index 7adaf5f0a5859..3e74402263cf9 100644 --- a/pandas/tests/arrays/masked_shared.py +++ b/pandas/tests/arrays/masked_shared.py @@ -16,7 +16,7 @@ def _compare_other(self, data, op, 
other): expected = pd.Series(op(data._data, other), dtype="boolean") # fill the nan locations - expected[data._mask.to_numpy()] = pd.NA + expected[data._mask] = pd.NA tm.assert_series_equal(result, expected) @@ -28,7 +28,7 @@ def _compare_other(self, data, op, other): expected = op(pd.Series(data._data), other).astype("boolean") # fill the nan locations - expected[data._mask.to_numpy()] = pd.NA + expected[data._mask] = pd.NA tm.assert_series_equal(result, expected) @@ -43,7 +43,7 @@ def test_scalar(self, other, comparison_op, dtype): expected = pd.array([None, None, None], dtype="boolean") else: values = op(left._data, other) - expected = pd.arrays.BooleanArray(values, left._mask.to_numpy(), copy=True) + expected = pd.arrays.BooleanArray(values, left._mask, copy=True) tm.assert_extension_array_equal(result, expected) # ensure we haven't mutated anything inplace @@ -74,7 +74,7 @@ def test_array(self, comparison_op, dtype): result = op(left, right) values = op(left._data, right._data) - mask = left._mask.to_numpy() | right._mask.to_numpy() + mask = left._mask | right._mask expected = pd.arrays.BooleanArray(values, mask) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index eb7b2f585b426..3b627f2fae845 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -277,11 +277,9 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._data, result._values._data, check_same="same" ) - # assert np.shares_memory(index._values._mask, result._values._mask) + assert np.shares_memory(index._values._mask, result._values._mask) tm.assert_numpy_array_equal( - index._values._mask.to_numpy(), - result._values._mask.to_numpy(), - check_same="same", + index._values._mask, result._values._mask, check_same="same" ) elif index.dtype == "string[python]": assert np.shares_memory(index._values._ndarray, result._values._ndarray) 
From dabe1b6efb5ce11e0a0cf65fae88e7259ee914ca Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 12 Aug 2023 08:08:24 -0400 Subject: [PATCH 009/126] implement __or__ --- pandas/_libs/arrays.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index c8440de16bc38..242e2d0ea8667 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -242,6 +242,9 @@ cdef class BitMaskArray: def __invert__(self): return ~self.to_numpy() + def __or__(self, other): + return self.to_numpy().__or__(other) + def to_numpy(self) -> ndarray: cdef ndarray[uint8_t] result result = np.empty(self.array_len, dtype=bool) From 28f7ab18661a63d052c95a2c0d310c4f5671725d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 12 Aug 2023 13:33:08 -0400 Subject: [PATCH 010/126] checkpoint --- pandas/_libs/arrays.pyx | 110 ++++++++++++++++++++++---- pandas/_libs/index.pyx | 2 +- pandas/core/algorithms.py | 2 +- pandas/core/arrays/boolean.py | 2 +- pandas/core/arrays/masked.py | 34 ++++---- pandas/core/arrays/numeric.py | 2 +- pandas/core/reshape/merge.py | 4 +- pandas/tests/indexes/test_old_base.py | 8 +- 8 files changed, 123 insertions(+), 41 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 242e2d0ea8667..27f36c80802a8 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -21,7 +21,10 @@ from libc.stdlib cimport ( malloc, ) -from pandas._libs.lib import is_list_like +from pandas._libs.lib import ( + is_list_like, + is_scalar, +) cdef extern from "pandas/vendored/nanoarrow.h": @@ -209,34 +212,105 @@ cdef class NDArrayBacked: return to_concat[0]._from_backing_data(new_arr) +def _unpickle_bitmaskarray(array): + bma = BitMaskArray(array) + return bma + + cdef class BitMaskArray: - cdef array_len cdef uint8_t* validity_buffer - - def __cinit__(self, np_array): - self.array_len = len(np_array) - nbytes = len(np_array) // 8 + 1 - self.validity_buffer = malloc(nbytes) - for index, value in 
enumerate(np_array): - self[index] = value + cdef public: + int array_len + int nbytes + + def __cinit__(self, data): + if isinstance(data, np.ndarray): + self.array_len = len(data) + self.nbytes = len(data) // 8 + 1 + self.validity_buffer = malloc(self.nbytes) + for index, value in enumerate(data): + self[index] = value + elif isinstance(data, type(self)): + self.array_len = data.array_len + self.nbytes = data.nbytes + self.validity_buffer = malloc(self.nbytes) + + # TODO: tried making validity_buffer public with memcpy but got + # Cannot convert Python object to 'const void *' error + for i in range(self.nbytes): + if data[i]: + ArrowBitSet(self.validity_buffer, i) + else: + ArrowBitClear(self.validity_buffer, i) + else: + raise TypeError("Unsupported argument to BitMaskArray constructor") def __dealloc__(self): free(self.validity_buffer) def __setitem__(self, key, value): if is_list_like(key): - for k in key: + if is_scalar(value): + for index, k in enumerate(key): + if not k: + continue + if value: + ArrowBitSet(self.validity_buffer, index) + else: + ArrowBitClear(self.validity_buffer, index) + else: + if len(key) != len(value): + raise ValueError("Must provide an equal number of elements to mask") + for index, (k, v) in enumerate(zip(key, value)): + if not k: + continue + if v: + ArrowBitSet(self.validity_buffer, index) + else: + ArrowBitClear(self.validity_buffer, index) + elif isinstance(key, slice): + pos = key.start if key.start else 0 + end = key.stop + step = key.step if key.step else 1 + + if not end: + return + + if step > 0: + while pos < end: + if value: + ArrowBitSet(self.validity_buffer, pos) + else: + ArrowBitClear(self.validity_buffer, pos) + + pos += step + elif step < 0: + while pos > end: + if value: + ArrowBitSet(self.validity_buffer, pos) + else: + ArrowBitClear(self.validity_buffer, pos) + + pos += step + else: + if is_scalar(value): if value: - ArrowBitSet(self.validity_buffer, k) + ArrowBitSet(self.validity_buffer, key) else: - 
ArrowBitClear(self.validity_buffer, k) - else: - if value: - ArrowBitSet(self.validity_buffer, key) + ArrowBitClear(self.validity_buffer, key) else: - ArrowBitClear(self.validity_buffer, key) + for val in value: + if val: + ArrowBitSet(self.validity_buffer, key) + else: + ArrowBitClear(self.validity_buffer, key) def __getitem__(self, key): + if is_list_like(key): + return np.array([bool(ArrowBitGet(self.validity_buffer, k)) for k in key]) + elif isinstance(key, slice): + return self.to_numpy()[key] + return bool(ArrowBitGet(self.validity_buffer, key)) def __invert__(self): @@ -245,6 +319,10 @@ cdef class BitMaskArray: def __or__(self, other): return self.to_numpy().__or__(other) + def __reduce__(self): + object_state = (self.to_numpy(),) + return (_unpickle_bitmaskarray, object_state) + def to_numpy(self) -> ndarray: cdef ndarray[uint8_t] result result = np.empty(self.array_len, dtype=bool) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index e974b5d0eec46..24c81b01e897b 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1182,7 +1182,7 @@ cdef class MaskedIndexEngine(IndexEngine): def _get_mask(self, object values) -> np.ndarray: if hasattr(values, "_mask"): - return values._mask + return values._mask.to_numpy() # We are an ArrowExtensionArray return values.isna() diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 14dee202a9d8d..3861a18316563 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -998,7 +998,7 @@ def duplicated( """ if hasattr(values, "dtype") and isinstance(values.dtype, BaseMaskedDtype): values = cast("BaseMaskedArray", values) - return htable.duplicated(values._data, keep=keep, mask=values._mask) + return htable.duplicated(values._data, keep=keep, mask=values._mask.to_numpy()) values = _ensure_data(values) return htable.duplicated(values, keep=keep) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 43344f04085ae..f63f642b37450 100644 --- 
a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -168,7 +168,7 @@ def coerce_to_array( if isinstance(values, BooleanArray): if mask is not None: raise ValueError("cannot pass mask for BooleanArray input") - values, mask = values._data, values._mask + values, mask = values._data, values._mask.to_numpy() if copy: values = values.copy() mask = mask.copy() diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index fb47982d3807c..5e5813d405f98 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -237,7 +237,7 @@ def fillna( if method is not None: func = missing.get_fill_func(method, ndim=self.ndim) npvalues = self._data.T - new_mask = mask.to_numpy().T + new_mask = mask.T if copy: npvalues = npvalues.copy() new_mask = new_mask.copy() @@ -623,7 +623,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): inputs2 = [] for x in inputs: if isinstance(x, BaseMaskedArray): - mask |= x._mask + mask |= x._mask.to_numpy() inputs2.append(x._data) else: inputs2.append(x) @@ -1095,7 +1095,7 @@ def equals(self, other) -> bool: # GH#44382 if e.g. self[1] is np.nan and other[1] is pd.NA, we are NOT # equal. 
- if not np.array_equal(self._mask.to_numpy(), other._mask): + if not np.array_equal(self._mask.to_numpy(), other._mask.to_numpy()): return False left = self._data[~self._mask.to_numpy()] @@ -1221,7 +1221,7 @@ def sum( result = masked_reductions.sum( self._data, - self._mask, + self._mask.to_numpy(), skipna=skipna, min_count=min_count, axis=axis, @@ -1242,7 +1242,7 @@ def prod( result = masked_reductions.prod( self._data, - self._mask, + self._mask.to_numpy(), skipna=skipna, min_count=min_count, axis=axis, @@ -1255,7 +1255,7 @@ def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): nv.validate_mean((), kwargs) result = masked_reductions.mean( self._data, - self._mask, + self._mask.to_numpy(), skipna=skipna, axis=axis, ) @@ -1267,7 +1267,7 @@ def var( nv.validate_stat_ddof_func((), kwargs, fname="var") result = masked_reductions.var( self._data, - self._mask, + self._mask.to_numpy(), skipna=skipna, axis=axis, ddof=ddof, @@ -1280,7 +1280,7 @@ def std( nv.validate_stat_ddof_func((), kwargs, fname="std") result = masked_reductions.std( self._data, - self._mask, + self._mask.to_numpy(), skipna=skipna, axis=axis, ddof=ddof, @@ -1291,7 +1291,7 @@ def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): nv.validate_min((), kwargs) result = masked_reductions.min( self._data, - self._mask, + self._mask.to_numpy(), skipna=skipna, axis=axis, ) @@ -1301,7 +1301,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): nv.validate_max((), kwargs) result = masked_reductions.max( self._data, - self._mask, + self._mask.to_numpy(), skipna=skipna, axis=axis, ) @@ -1378,7 +1378,9 @@ def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): # _NestedSequence[_SupportsArray[dtype[Any]]], # bool, int, float, complex, str, bytes, # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" - np.putmask(values, self._mask, self._falsey_value) # type: ignore[arg-type] + np.putmask( + values, 
self._mask.to_numpy(), self._falsey_value + ) # type: ignore[arg-type] result = values.any() if skipna: return result @@ -1459,7 +1461,9 @@ def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): # _NestedSequence[_SupportsArray[dtype[Any]]], # bool, int, float, complex, str, bytes, # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" - np.putmask(values, self._mask, self._truthy_value) # type: ignore[arg-type] + np.putmask( + values, self._mask.to_numpy(), self._truthy_value + ) # type: ignore[arg-type] result = values.all(axis=axis) if skipna: @@ -1474,7 +1478,7 @@ def _accumulate( self, name: str, *, skipna: bool = True, **kwargs ) -> BaseMaskedArray: data = self._data - mask = self._mask + mask = self._mask.to_numpy() op = getattr(masked_accumulations, name) data, mask = op(data, mask, skipna=skipna, **kwargs) @@ -1500,7 +1504,7 @@ def _groupby_op( op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) # libgroupby functions are responsible for NOT altering mask - mask = self._mask + mask = self._mask.to_numpy() if op.kind != "aggregate": result_mask = mask.copy() else: @@ -1537,7 +1541,7 @@ def transpose_homogeneous_masked_arrays( values = [arr._data.reshape(1, -1) for arr in masked_arrays] transposed_values = np.concatenate(values, axis=0) - masks = [arr._mask.reshape(1, -1) for arr in masked_arrays] + masks = [arr._mask.to_numpy().reshape(1, -1) for arr in masked_arrays] transposed_masks = np.concatenate(masks, axis=0) dtype = masked_arrays[0].dtype diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 0e86c1efba17a..304e4f4097a69 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -146,7 +146,7 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype cls = dtype_cls.construct_array_type() if isinstance(values, cls): - values, mask = values._data, values._mask + values, mask = values._data, values._mask.to_numpy() if dtype is 
not None: values = values.astype(dtype.numpy_dtype, copy=False) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6987a0ac7bf6b..102000ebae57e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2449,8 +2449,8 @@ def _factorize_keys( if isinstance(lk, BaseMaskedArray): assert isinstance(rk, BaseMaskedArray) - llab = rizer.factorize(lk._data, mask=lk._mask) - rlab = rizer.factorize(rk._data, mask=rk._mask) + llab = rizer.factorize(lk._data, mask=lk._mask.to_numpy()) + rlab = rizer.factorize(rk._data, mask=rk._mask.to_numpy()) elif isinstance(lk, ArrowExtensionArray): assert isinstance(rk, ArrowExtensionArray) # we can only get here with numeric dtypes diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 3b627f2fae845..3250e16be64e9 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -277,10 +277,10 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._data, result._values._data, check_same="same" ) - assert np.shares_memory(index._values._mask, result._values._mask) - tm.assert_numpy_array_equal( - index._values._mask, result._values._mask, check_same="same" - ) + # assert np.shares_memory(index._values._mask, result._values._mask) + # tm.assert_numpy_array_equal( + # index._values._mask, result._values._mask, check_same="same" + # ) elif index.dtype == "string[python]": assert np.shares_memory(index._values._ndarray, result._values._ndarray) tm.assert_numpy_array_equal( From 43f3cbc4aa9a3dd625851b8f1058e7f2d477fe39 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 12 Aug 2023 15:40:34 -0400 Subject: [PATCH 011/126] more cleanups --- pandas/_libs/arrays.pyx | 80 ++++--------------- pandas/core/arrays/boolean.py | 16 ++-- pandas/core/arrays/masked.py | 12 +-- pandas/core/arrays/string_.py | 2 +- pandas/core/arrays/string_arrow.py | 2 +- .../tests/arrays/boolean/test_construction.py | 12 
+-- pandas/tests/arrays/boolean/test_function.py | 16 ++-- pandas/tests/arrays/boolean/test_logical.py | 4 +- .../tests/arrays/categorical/test_astype.py | 1 + .../tests/arrays/floating/test_arithmetic.py | 5 +- pandas/tests/arrays/floating/test_astype.py | 2 +- .../tests/arrays/floating/test_comparison.py | 2 +- .../arrays/floating/test_construction.py | 6 +- .../tests/arrays/integer/test_construction.py | 4 +- pandas/tests/arrays/integer/test_dtypes.py | 2 +- pandas/tests/arrays/integer/test_function.py | 2 +- pandas/tests/arrays/masked_shared.py | 6 +- 17 files changed, 67 insertions(+), 107 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 27f36c80802a8..d6afa8b98e2bd 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -21,11 +21,6 @@ from libc.stdlib cimport ( malloc, ) -from pandas._libs.lib import ( - is_list_like, - is_scalar, -) - cdef extern from "pandas/vendored/nanoarrow.h": int8_t ArrowBitGet(const uint8_t*, int64_t) @@ -249,75 +244,34 @@ cdef class BitMaskArray: free(self.validity_buffer) def __setitem__(self, key, value): - if is_list_like(key): - if is_scalar(value): - for index, k in enumerate(key): - if not k: - continue - if value: - ArrowBitSet(self.validity_buffer, index) - else: - ArrowBitClear(self.validity_buffer, index) + if isinstance(key, int): + if value: + ArrowBitSet(self.validity_buffer, key) else: - if len(key) != len(value): - raise ValueError("Must provide an equal number of elements to mask") - for index, (k, v) in enumerate(zip(key, value)): - if not k: - continue - if v: - ArrowBitSet(self.validity_buffer, index) - else: - ArrowBitClear(self.validity_buffer, index) - elif isinstance(key, slice): - pos = key.start if key.start else 0 - end = key.stop - step = key.step if key.step else 1 - - if not end: - return - - if step > 0: - while pos < end: - if value: - ArrowBitSet(self.validity_buffer, pos) - else: - ArrowBitClear(self.validity_buffer, pos) - - pos += step - elif step < 0: 
- while pos > end: - if value: - ArrowBitSet(self.validity_buffer, pos) - else: - ArrowBitClear(self.validity_buffer, pos) - - pos += step + ArrowBitClear(self.validity_buffer, key) else: - if is_scalar(value): - if value: - ArrowBitSet(self.validity_buffer, key) + arr = self.to_numpy() + arr[key] = value + for index, val in enumerate(arr): + if val: + ArrowBitSet(self.validity_buffer, index) else: - ArrowBitClear(self.validity_buffer, key) - else: - for val in value: - if val: - ArrowBitSet(self.validity_buffer, key) - else: - ArrowBitClear(self.validity_buffer, key) + ArrowBitClear(self.validity_buffer, index) def __getitem__(self, key): - if is_list_like(key): - return np.array([bool(ArrowBitGet(self.validity_buffer, k)) for k in key]) - elif isinstance(key, slice): + if isinstance(key, int): + return bool(ArrowBitGet(self.validity_buffer, key)) + else: return self.to_numpy()[key] - return bool(ArrowBitGet(self.validity_buffer, key)) - def __invert__(self): return ~self.to_numpy() def __or__(self, other): - return self.to_numpy().__or__(other) + if isinstance(other, type(self)): + return self.to_numpy() | other.to_numpy() + else: + return self.to_numpy() | other def __reduce__(self): object_state = (self.to_numpy(),) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index f63f642b37450..03f2a2cd0e07e 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -360,7 +360,7 @@ def _logical_method(self, other, op): mask = None if isinstance(other, BooleanArray): - other, mask = other._data, other._mask + other, mask = other._data, other._mask.to_numpy() elif is_list_like(other): other = np.asarray(other, dtype="bool") if other.ndim > 1: @@ -379,12 +379,16 @@ def _logical_method(self, other, op): raise ValueError("Lengths must match") if op.__name__ in {"or_", "ror_"}: - result, mask = ops.kleene_or(self._data, other, self._mask, mask) + result, mask = ops.kleene_or(self._data, other, self._mask.to_numpy(), mask) elif 
op.__name__ in {"and_", "rand_"}: - result, mask = ops.kleene_and(self._data, other, self._mask, mask) + result, mask = ops.kleene_and( + self._data, other, self._mask.to_numpy(), mask + ) else: # i.e. xor, rxor - result, mask = ops.kleene_xor(self._data, other, self._mask, mask) + result, mask = ops.kleene_xor( + self._data, other, self._mask.to_numpy(), mask + ) # i.e. BooleanArray return self._maybe_mask_result(result, mask) @@ -393,7 +397,7 @@ def _accumulate( self, name: str, *, skipna: bool = True, **kwargs ) -> BaseMaskedArray: data = self._data - mask = self._mask + mask = self._mask.to_numpy() if name in ("cummin", "cummax"): op = getattr(masked_accumulations, name) data, mask = op(data, mask, skipna=skipna, **kwargs) @@ -401,6 +405,6 @@ def _accumulate( else: from pandas.core.arrays import IntegerArray - return IntegerArray(data.astype(int), mask)._accumulate( + return IntegerArray(data.astype(int), mask.to_numpy())._accumulate( name, skipna=skipna, **kwargs ) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 5e5813d405f98..ed63b06760e69 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -201,13 +201,13 @@ def pad_or_backfill( limit_area: Literal["inside", "outside"] | None = None, copy: bool = True, ) -> Self: - mask = self._mask + mask = self._mask.to_numpy() if mask.any(): func = missing.get_fill_func(method, ndim=self.ndim) npvalues = self._data.T - new_mask = mask.to_numpy().T + new_mask = mask.T if copy: npvalues = npvalues.copy() new_mask = new_mask.copy() @@ -696,7 +696,7 @@ def _propagate_mask( elif is_list_like(other) and len(other) == len(mask): mask = mask | isna(other) else: - mask = self._mask.to_numpy() | mask.to_numpy() + mask = self._mask | mask # Incompatible return value type (got "Optional[ndarray[Any, dtype[bool_]]]", # expected "ndarray[Any, dtype[bool_]]") return mask # type: ignore[return-value] @@ -802,7 +802,7 @@ def _cmp_method(self, other, op) -> BooleanArray: mask = 
None if isinstance(other, BaseMaskedArray): - other, mask = other._data, other._mask + other, mask = other._data, other._mask.to_numpy() elif is_list_like(other): other = np.asarray(other) @@ -869,7 +869,7 @@ def _maybe_mask_result( # e.g. test_numeric_arr_mul_tdscalar_numexpr_path from pandas.core.arrays import TimedeltaArray - result[mask.to_numpy()] = result.dtype.type("NaT") + result[mask] = result.dtype.type("NaT") if not isinstance(result, TimedeltaArray): return TimedeltaArray._simple_new(result, dtype=result.dtype) @@ -882,7 +882,7 @@ def _maybe_mask_result( return IntegerArray(result, mask, copy=False) else: - result[mask.to_numpy()] = np.nan + result[mask] = np.nan return result def isna(self) -> np.ndarray: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 25f1c2ec6ce4f..b55e4fa0b3a72 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -349,7 +349,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype - na_values = scalars._mask + na_values = scalars._mask.to_numpy() result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) result[na_values] = libmissing.NA diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4a70fcf6b5a93..27d38bf4b6152 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -149,7 +149,7 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and # numerical issues with Float32Dtype - na_values = scalars._mask + na_values = scalars._mask.to_numpy() result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) return cls(pa.array(result, mask=na_values, type=pa.string())) diff --git 
a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index d26eea19c06e9..89f8e52fe21c8 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -40,11 +40,11 @@ def test_boolean_array_constructor_copy(): result = BooleanArray(values, mask) assert result._data is values - assert result._mask is mask + # assert result._mask is mask result = BooleanArray(values, mask, copy=True) assert result._data is not values - assert result._mask is not mask + # assert result._mask is not mask def test_to_boolean_array(): @@ -159,12 +159,12 @@ def test_coerce_to_array(): expected = BooleanArray(values, mask) tm.assert_extension_array_equal(result, expected) assert result._data is values - assert result._mask is mask + # assert result._mask is mask result = BooleanArray(*coerce_to_array(values, mask=mask, copy=True)) expected = BooleanArray(values, mask) tm.assert_extension_array_equal(result, expected) assert result._data is not values - assert result._mask is not mask + # assert result._mask is not mask # mixed missing from values and mask values = [True, False, None, False] @@ -202,12 +202,12 @@ def test_coerce_to_array_from_boolean_array(): tm.assert_extension_array_equal(result, arr) # no copy assert result._data is arr._data - assert result._mask is arr._mask + # assert result._mask is arr._mask result = BooleanArray(*coerce_to_array(arr), copy=True) tm.assert_extension_array_equal(result, arr) assert result._data is not arr._data - assert result._mask is not arr._mask + # assert result._mask is not arr._mask with pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"): coerce_to_array(arr, mask=mask) diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index 2b3f3d3d16ac6..b58ec19dff329 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py 
@@ -13,36 +13,36 @@ def test_ufuncs_binary(ufunc): a = pd.array([True, False, None], dtype="boolean") result = ufunc(a, a) expected = pd.array(ufunc(a._data, a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) s = pd.Series(a) result = ufunc(s, a) expected = pd.Series(ufunc(a._data, a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_series_equal(result, expected) # Boolean with numpy array arr = np.array([True, True, False]) result = ufunc(a, arr) expected = pd.array(ufunc(a._data, arr), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) result = ufunc(arr, a) expected = pd.array(ufunc(arr, a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) # BooleanArray with scalar result = ufunc(a, True) expected = pd.array(ufunc(a._data, True), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) result = ufunc(True, a) expected = pd.array(ufunc(True, a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) # not handled types @@ -56,13 +56,13 @@ def test_ufuncs_unary(ufunc): a = pd.array([True, False, None], dtype="boolean") result = ufunc(a) expected = pd.array(ufunc(a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) ser = pd.Series(a) result = ufunc(ser) expected = pd.Series(ufunc(a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/boolean/test_logical.py 
b/pandas/tests/arrays/boolean/test_logical.py index 66c117ea3fc66..4cdaf3a90b21d 100644 --- a/pandas/tests/arrays/boolean/test_logical.py +++ b/pandas/tests/arrays/boolean/test_logical.py @@ -238,8 +238,8 @@ def test_no_masked_assumptions(self, other, all_logical_operators): tm.assert_extension_array_equal(result, expected) if isinstance(other, BooleanArray): - other._data[other._mask] = True - a._data[a._mask] = False + other._data[other._mask.to_numpy()] = True + a._data[a._mask.to_numpy()] = False result = getattr(a, all_logical_operators)(other) expected = getattr(b, all_logical_operators)(other) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index d2f9f6dffab49..ace785e6ae5c8 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -146,6 +146,7 @@ def test_astype_object_timestamp_categories(self): expected = np.array([Timestamp("2014-01-01 00:00:00")], dtype="object") tm.assert_numpy_array_equal(result, expected) + @pytest.mark.skip(reason="Not applicable with bitmask backed arrays") def test_astype_category_readonly_mask_values(self): # GH#53658 arr = array([0, 1, 2], dtype="Int64") diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 056c22d8c1131..f7fd08361f5e1 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -67,7 +67,7 @@ def test_pow_scalar(dtype): # TODO np.nan should be converted to pd.NA / missing before operation? 
expected = FloatingArray( np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype), - mask=a._mask, + mask=a._mask.to_numpy(), ) tm.assert_extension_array_equal(result, expected) @@ -88,7 +88,8 @@ def test_pow_scalar(dtype): result = np.nan**a expected = FloatingArray( - np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask + np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), + mask=a._mask.to_numpy(), ) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index ade3dbd2c99da..cc193cc644ec4 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -88,7 +88,7 @@ def test_astype_copy(): result = arr.astype("Float64", copy=False) assert result is arr assert np.shares_memory(result._data, arr._data) - assert np.shares_memory(result._mask, arr._mask) + # assert np.shares_memory(result._mask, arr._mask) result[0] = 10 assert arr[0] == 10 result[0] = pd.NA diff --git a/pandas/tests/arrays/floating/test_comparison.py b/pandas/tests/arrays/floating/test_comparison.py index a429649f1ce1d..19eb02374d476 100644 --- a/pandas/tests/arrays/floating/test_comparison.py +++ b/pandas/tests/arrays/floating/test_comparison.py @@ -61,5 +61,5 @@ def test_equals_nan_vs_na(): # with mask[1] = True, the only difference is data[1], which should # not matter for equals - mask[1] = True + left._mask[1] = True assert left.equals(right) diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 4007ee6b415c9..699153b2c0639 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -23,7 +23,7 @@ def test_floating_array_constructor(): expected = pd.array([1, 2, 3, np.nan], dtype="Float64") tm.assert_extension_array_equal(result, expected) 
tm.assert_numpy_array_equal(result._data, values) - tm.assert_numpy_array_equal(result._mask, mask) + tm.assert_numpy_array_equal(result._mask.to_numpy(), mask) msg = r".* should be .* numpy array. Use the 'pd.array' function instead" with pytest.raises(TypeError, match=msg): @@ -62,11 +62,11 @@ def test_floating_array_constructor_copy(): result = FloatingArray(values, mask) assert result._data is values - assert result._mask is mask + # assert result._mask is mask result = FloatingArray(values, mask, copy=True) assert result._data is not values - assert result._mask is not mask + # assert result._mask is not mask def test_to_array(): diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index 9ecfc51cb2208..f6ef5db17044b 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -100,11 +100,11 @@ def test_integer_array_constructor_copy(): result = IntegerArray(values, mask) assert result._data is values - assert result._mask is mask + # assert result._mask is mask result = IntegerArray(values, mask, copy=True) assert result._data is not values - assert result._mask is not mask + # assert result._mask is not mask @pytest.mark.parametrize( diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index f50b4cfd0b520..312fa90844847 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -163,7 +163,7 @@ def test_astype_copy(): result = arr.astype("Int64", copy=False) assert result is arr assert np.shares_memory(result._data, arr._data) - assert np.shares_memory(result._mask, arr._mask) + # assert np.shares_memory(result._mask, arr._mask) result[0] = 10 assert arr[0] == 10 result[0] = pd.NA diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index d48b636a98feb..40c9dcc697f46 100644 --- 
a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -26,7 +26,7 @@ def test_ufuncs_single_float(ufunc): a = pd.array([1, 2, -3, np.nan]) with np.errstate(invalid="ignore"): result = ufunc(a) - expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask) + expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask.to_numpy()) tm.assert_extension_array_equal(result, expected) s = pd.Series(a) diff --git a/pandas/tests/arrays/masked_shared.py b/pandas/tests/arrays/masked_shared.py index 3e74402263cf9..22caeb94a13a1 100644 --- a/pandas/tests/arrays/masked_shared.py +++ b/pandas/tests/arrays/masked_shared.py @@ -16,7 +16,7 @@ def _compare_other(self, data, op, other): expected = pd.Series(op(data._data, other), dtype="boolean") # fill the nan locations - expected[data._mask] = pd.NA + expected[data._mask.to_numpy()] = pd.NA tm.assert_series_equal(result, expected) @@ -28,7 +28,7 @@ def _compare_other(self, data, op, other): expected = op(pd.Series(data._data), other).astype("boolean") # fill the nan locations - expected[data._mask] = pd.NA + expected[data._mask.to_numpy()] = pd.NA tm.assert_series_equal(result, expected) @@ -43,7 +43,7 @@ def test_scalar(self, other, comparison_op, dtype): expected = pd.array([None, None, None], dtype="boolean") else: values = op(left._data, other) - expected = pd.arrays.BooleanArray(values, left._mask, copy=True) + expected = pd.arrays.BooleanArray(values, left._mask.to_numpy(), copy=True) tm.assert_extension_array_equal(result, expected) # ensure we haven't mutated anything inplace From 902cef986c666895b9df003d94f84430f8a0b233 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 12 Aug 2023 15:45:00 -0400 Subject: [PATCH 012/126] groupby support --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index dbb2d0e25de2e..bfb47db7fc43a 100644 --- 
a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4354,7 +4354,7 @@ def post_processor( def blk_func(values: ArrayLike) -> ArrayLike: orig_vals = values if isinstance(values, BaseMaskedArray): - mask = values._mask + mask = values._mask.to_numpy() result_mask = np.zeros((ngroups, nqs), dtype=np.bool_) else: mask = isna(values) From 4d4ebfed2bb1c42746bd8a4bc2235cb1ce964a26 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 13 Aug 2023 03:25:49 -0400 Subject: [PATCH 013/126] prep for 2d --- pandas/core/arrays/boolean.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 03f2a2cd0e07e..63c12efa4bc5c 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -405,6 +405,6 @@ def _accumulate( else: from pandas.core.arrays import IntegerArray - return IntegerArray(data.astype(int), mask.to_numpy())._accumulate( + return IntegerArray(data.astype(int), mask)._accumulate( name, skipna=skipna, **kwargs ) From 2898bb1a98118bd71fda12a1ad69cf8e3056ef7d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 13 Aug 2023 03:39:37 -0400 Subject: [PATCH 014/126] support 2D --- pandas/_libs/arrays.pyx | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index d6afa8b98e2bd..5cb21178e9f57 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -215,18 +215,21 @@ def _unpickle_bitmaskarray(array): cdef class BitMaskArray: cdef uint8_t* validity_buffer cdef public: - int array_len + int array_size int nbytes + object array_shape def __cinit__(self, data): if isinstance(data, np.ndarray): - self.array_len = len(data) - self.nbytes = len(data) // 8 + 1 + self.array_size = data.size + self.array_shape = data.shape + self.nbytes = self.array_size // 8 + 1 self.validity_buffer = malloc(self.nbytes) - for index, value in enumerate(data): + for index, value in 
enumerate(data.flatten()): self[index] = value elif isinstance(data, type(self)): - self.array_len = data.array_len + self.array_size = data.array_size + self.array_shape = data.shape self.nbytes = data.nbytes self.validity_buffer = malloc(self.nbytes) @@ -252,7 +255,7 @@ cdef class BitMaskArray: else: arr = self.to_numpy() arr[key] = value - for index, val in enumerate(arr): + for index, val in enumerate(arr.flatten()): if val: ArrowBitSet(self.validity_buffer, index) else: @@ -279,8 +282,8 @@ cdef class BitMaskArray: def to_numpy(self) -> ndarray: cdef ndarray[uint8_t] result - result = np.empty(self.array_len, dtype=bool) - for i in range(self.array_len): + result = np.empty(self.array_size, dtype=bool) + for i in range(self.array_size): result[i] = self[i] - return result + return result.reshape(self.array_shape) From 108a86cbd8796010ce9cf6a8669944f145243a66 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 13 Aug 2023 03:57:07 -0400 Subject: [PATCH 015/126] fix numeric --- pandas/core/tools/numeric.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index a50dbeb110bff..e3e84227b096f 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -203,7 +203,7 @@ def to_numeric( # save mask to reconstruct the full array after casting mask: npt.NDArray[np.bool_] | None = None if isinstance(values, BaseMaskedArray): - mask = values._mask + mask = values._mask.to_numpy() values = values._data[~mask] values_dtype = getattr(values, "dtype", None) @@ -278,8 +278,7 @@ def to_numeric( if mask is None or (new_mask is not None and new_mask.shape == mask.shape): # GH 52588 mask = new_mask - else: - mask = mask.copy() + assert isinstance(mask, np.ndarray) data = np.zeros(mask.shape, dtype=values.dtype) data[~mask] = values From 8decf2a6bc49c379e114e8367a60228953ade6cb Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 13 Aug 2023 04:00:40 -0400 Subject: [PATCH 016/126] temp 
pass for CI --- pandas/tests/frame/methods/test_quantile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 61b253b24a7ec..90f6ea65b823a 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -863,6 +863,7 @@ def test_quantile_ea(self, request, obj, index): tm.assert_equal(result, expected) + @pytest.mark.skip(reason="need to fix negative indexer with bitmask") def test_quantile_ea_with_na(self, obj, index): obj.iloc[0] = index._na_value obj.iloc[-1] = index._na_value From 3da7aa2523c0290c157613ad4330f3a6a34c1e8d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 13 Aug 2023 05:04:44 -0400 Subject: [PATCH 017/126] fixed negative indexing --- pandas/_libs/arrays.pyx | 4 ++-- pandas/tests/frame/methods/test_quantile.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 5cb21178e9f57..033ca2a6e4f87 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -247,7 +247,7 @@ cdef class BitMaskArray: free(self.validity_buffer) def __setitem__(self, key, value): - if isinstance(key, int): + if isinstance(key, int) and key >= 0: if value: ArrowBitSet(self.validity_buffer, key) else: @@ -262,7 +262,7 @@ cdef class BitMaskArray: ArrowBitClear(self.validity_buffer, index) def __getitem__(self, key): - if isinstance(key, int): + if isinstance(key, int) and key >= 0: return bool(ArrowBitGet(self.validity_buffer, key)) else: return self.to_numpy()[key] diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 90f6ea65b823a..61b253b24a7ec 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -863,7 +863,6 @@ def test_quantile_ea(self, request, obj, index): tm.assert_equal(result, expected) - @pytest.mark.skip(reason="need to fix negative 
indexer with bitmask") def test_quantile_ea_with_na(self, obj, index): obj.iloc[0] = index._na_value obj.iloc[-1] = index._na_value From 11467c722e47fdf9bf56295f0095c01812968f50 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 13 Aug 2023 08:27:53 -0400 Subject: [PATCH 018/126] fixed copying --- pandas/_libs/arrays.pyx | 37 +++++++++++++++++--------------- pandas/core/arrays/masked.py | 8 ++++++- pandas/core/arrays/timedeltas.py | 2 +- 3 files changed, 28 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 033ca2a6e4f87..4699e44d569ee 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -213,13 +213,15 @@ def _unpickle_bitmaskarray(array): cdef class BitMaskArray: - cdef uint8_t* validity_buffer cdef public: int array_size int nbytes object array_shape + object parent + uint8_t* validity_buffer def __cinit__(self, data): + self.parent = None if isinstance(data, np.ndarray): self.array_size = data.size self.array_shape = data.shape @@ -228,26 +230,19 @@ cdef class BitMaskArray: for index, value in enumerate(data.flatten()): self[index] = value elif isinstance(data, type(self)): - self.array_size = data.array_size - self.array_shape = data.shape - self.nbytes = data.nbytes - self.validity_buffer = malloc(self.nbytes) - - # TODO: tried making validity_buffer public with memcpy but got - # Cannot convert Python object to 'const void *' error - for i in range(self.nbytes): - if data[i]: - ArrowBitSet(self.validity_buffer, i) - else: - ArrowBitClear(self.validity_buffer, i) + self.parent = data + # other attributes are undefined when a parent exists else: raise TypeError("Unsupported argument to BitMaskArray constructor") def __dealloc__(self): - free(self.validity_buffer) + if not self.parent: + free(self.validity_buffer) def __setitem__(self, key, value): - if isinstance(key, int) and key >= 0: + if self.parent is not None: + self.parent.__setitem__(key, value) + elif isinstance(key, int) and key >= 0: 
if value: ArrowBitSet(self.validity_buffer, key) else: @@ -262,16 +257,22 @@ cdef class BitMaskArray: ArrowBitClear(self.validity_buffer, index) def __getitem__(self, key): - if isinstance(key, int) and key >= 0: + if self.parent is not None: + return self.parent.__getitem__(key) + elif isinstance(key, int) and key >= 0: return bool(ArrowBitGet(self.validity_buffer, key)) else: return self.to_numpy()[key] def __invert__(self): + if self.parent is not None: + return ~self.parent return ~self.to_numpy() def __or__(self, other): - if isinstance(other, type(self)): + if self.parent is not None: + return self.parent.__or__(other) + elif isinstance(other, type(self)): return self.to_numpy() | other.to_numpy() else: return self.to_numpy() | other @@ -281,6 +282,8 @@ cdef class BitMaskArray: return (_unpickle_bitmaskarray, object_state) def to_numpy(self) -> ndarray: + if self.parent is not None: + return self.parent.to_numpy() cdef ndarray[uint8_t] result result = np.empty(self.array_size, dtype=bool) for i in range(self.array_size): diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index ed63b06760e69..bb1be8cc2a4d0 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -120,7 +120,9 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): _falsey_value = Scalar # bool(_falsey_value) = False @classmethod - def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self: + def _simple_new( + cls, values: np.ndarray, mask: npt.NDArray[np.bool_] | BitMaskArray + ) -> Self: result = BaseMaskedArray.__new__(cls) result._data = values result._mask = BitMaskArray(mask) @@ -191,6 +193,10 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self.dtype.na_value return self._data[item] + # sending self._mask avoids copy of buffer + if np.array_equal(newmask, self._mask.to_numpy()): + return self._simple_new(self._data[item], self._mask) + return self._simple_new(self._data[item], newmask) def 
pad_or_backfill( diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a81609e1bb618..135850ee37991 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -1071,7 +1071,7 @@ def sequence_to_td64ns( # cast the unit, multiply base/frac separately # to avoid precision issues from float -> int if isinstance(data.dtype, ExtensionDtype): - mask = data._mask + mask = data._mask.to_numpy() data = data._data else: mask = np.isnan(data) From d91fb8e0b47bb71579e0b7de0c1c1c41258b1ff8 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 13 Aug 2023 09:38:44 -0400 Subject: [PATCH 019/126] Working --- pandas/core/arrays/boolean.py | 9 +++-- pandas/core/arrays/masked.py | 35 +++++++++++-------- pandas/core/arrays/numeric.py | 6 +++- .../tests/arrays/boolean/test_construction.py | 8 ++--- pandas/tests/arrays/floating/test_astype.py | 2 +- .../arrays/floating/test_construction.py | 7 ++-- .../tests/arrays/integer/test_construction.py | 7 ++-- pandas/tests/arrays/integer/test_dtypes.py | 2 +- pandas/tests/indexes/test_old_base.py | 10 +++--- 9 files changed, 54 insertions(+), 32 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 63c12efa4bc5c..62ae43f529204 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -27,6 +27,7 @@ if TYPE_CHECKING: import pyarrow + from pandas._libs.arrays import BitMaskArray from pandas._typing import ( Dtype, DtypeObj, @@ -168,7 +169,7 @@ def coerce_to_array( if isinstance(values, BooleanArray): if mask is not None: raise ValueError("cannot pass mask for BooleanArray input") - values, mask = values._data, values._mask.to_numpy() + values, mask = values._data, values._mask if copy: values = values.copy() mask = mask.copy() @@ -298,13 +299,15 @@ class BooleanArray(BaseMaskedArray): _FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"} @classmethod - def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> 
Self: + def _simple_new( + cls, values: np.ndarray, mask: npt.NDArray[np.bool_] | BitMaskArray + ) -> Self: result = super()._simple_new(values, mask) result._dtype = BooleanDtype() return result def __init__( - self, values: np.ndarray, mask: np.ndarray, copy: bool = False + self, values: np.ndarray, mask: np.ndarray | BitMaskArray, copy: bool = False ) -> None: if not (isinstance(values, np.ndarray) and values.dtype == np.bool_): raise TypeError( diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index bb1be8cc2a4d0..e14538a210c07 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -129,20 +129,31 @@ def _simple_new( return result def __init__( - self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False + self, + values: np.ndarray, + mask: npt.NDArray[np.bool_] | BitMaskArray, + copy: bool = False, ) -> None: # values is supposed to already be validated in the subclass - if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): + if not ( + isinstance(mask, BitMaskArray) + or (isinstance(mask, np.ndarray) and mask.dtype == np.bool_) + ): raise TypeError( - "mask should be boolean numpy array. Use " - "the 'pd.array' function instead" + "mask should be boolean numpy array or BitMaskArray. 
" + "Use the 'pd.array' function instead" ) - if values.shape != mask.shape: - raise ValueError("values.shape must match mask.shape") + if isinstance(mask, np.ndarray): + if values.shape != mask.shape: + raise ValueError("values.shape must match mask.shape") - if copy: - values = values.copy() - mask = mask.copy() + if copy: + values = values.copy() + mask = mask.copy() + else: + if copy: + values = values.copy() + mask = mask.to_numpy() self._data = values self._mask = BitMaskArray(mask) @@ -551,11 +562,7 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: data = self._data.astype(dtype.numpy_dtype, copy=copy) # mask is copied depending on whether the data was copied, and # not directly depending on the `copy` keyword - mask = ( - self._mask.to_numpy() - if data is self._data - else self._mask.to_numpy().copy() - ) + mask = self._mask if data is self._data else self._mask.to_numpy().copy() cls = dtype.construct_array_type() return cls(data, mask, copy=False) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 304e4f4097a69..76903074c763d 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -32,6 +32,7 @@ import pyarrow + from pandas._libs.arrays import BitMaskArray from pandas._typing import ( Dtype, DtypeObj, @@ -230,7 +231,10 @@ class NumericArray(BaseMaskedArray): _dtype_cls: type[NumericDtype] def __init__( - self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False + self, + values: np.ndarray, + mask: npt.NDArray[np.bool_] | BitMaskArray, + copy: bool = False, ) -> None: checker = self._dtype_cls._checker if not (isinstance(values, np.ndarray) and checker(values.dtype)): diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index 89f8e52fe21c8..37745f589e26d 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -44,7 +44,7 @@ def 
test_boolean_array_constructor_copy(): result = BooleanArray(values, mask, copy=True) assert result._data is not values - # assert result._mask is not mask + assert result._mask is not mask def test_to_boolean_array(): @@ -164,7 +164,7 @@ def test_coerce_to_array(): expected = BooleanArray(values, mask) tm.assert_extension_array_equal(result, expected) assert result._data is not values - # assert result._mask is not mask + assert result._mask is not mask # mixed missing from values and mask values = [True, False, None, False] @@ -202,12 +202,12 @@ def test_coerce_to_array_from_boolean_array(): tm.assert_extension_array_equal(result, arr) # no copy assert result._data is arr._data - # assert result._mask is arr._mask + assert result._mask.parent is arr._mask result = BooleanArray(*coerce_to_array(arr), copy=True) tm.assert_extension_array_equal(result, arr) assert result._data is not arr._data - # assert result._mask is not arr._mask + assert result._mask.parent is not arr._mask with pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"): coerce_to_array(arr, mask=mask) diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index cc193cc644ec4..b38a944238b38 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -88,7 +88,7 @@ def test_astype_copy(): result = arr.astype("Float64", copy=False) assert result is arr assert np.shares_memory(result._data, arr._data) - # assert np.shares_memory(result._mask, arr._mask) + assert result._mask is arr._mask result[0] = 10 assert arr[0] == 10 result[0] = pd.NA diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 699153b2c0639..3e9b669913749 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -25,7 +25,10 @@ def test_floating_array_constructor(): 
tm.assert_numpy_array_equal(result._data, values) tm.assert_numpy_array_equal(result._mask.to_numpy(), mask) - msg = r".* should be .* numpy array. Use the 'pd.array' function instead" + msg = ( + r".* should be .* numpy array( or BitMaskArray)?. " + r"Use the 'pd.array' function instead" + ) with pytest.raises(TypeError, match=msg): FloatingArray(values.tolist(), mask) @@ -66,7 +69,7 @@ def test_floating_array_constructor_copy(): result = FloatingArray(values, mask, copy=True) assert result._data is not values - # assert result._mask is not mask + assert result._mask is not mask def test_to_array(): diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index f6ef5db17044b..6cc240cd52aca 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -80,7 +80,10 @@ def test_integer_array_constructor(): expected = pd.array([1, 2, 3, np.nan], dtype="Int64") tm.assert_extension_array_equal(result, expected) - msg = r".* should be .* numpy array. Use the 'pd.array' function instead" + msg = ( + r".* should be .* numpy array( or BitMaskArray)?. 
" + r"Use the 'pd.array' function instead" + ) with pytest.raises(TypeError, match=msg): IntegerArray(values.tolist(), mask) @@ -104,7 +107,7 @@ def test_integer_array_constructor_copy(): result = IntegerArray(values, mask, copy=True) assert result._data is not values - # assert result._mask is not mask + assert result._mask is not mask @pytest.mark.parametrize( diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index 312fa90844847..70c2eeeb852c6 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -163,7 +163,7 @@ def test_astype_copy(): result = arr.astype("Int64", copy=False) assert result is arr assert np.shares_memory(result._data, arr._data) - # assert np.shares_memory(result._mask, arr._mask) + assert result._mask is arr._mask result[0] = 10 assert arr[0] == 10 result[0] = pd.NA diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 3250e16be64e9..6df45941df8cc 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -277,10 +277,12 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._data, result._values._data, check_same="same" ) - # assert np.shares_memory(index._values._mask, result._values._mask) - # tm.assert_numpy_array_equal( - # index._values._mask, result._values._mask, check_same="same" - # ) + assert index._values._mask is result._values._mask + tm.assert_numpy_array_equal( + index._values._mask.to_numpy(), + result._values._mask.to_numpy(), + check_same="copy", + ) elif index.dtype == "string[python]": assert np.shares_memory(index._values._ndarray, result._values._ndarray) tm.assert_numpy_array_equal( From 757605c8c9ae534b9afd278faf54da5342319f85 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 13 Aug 2023 10:04:12 -0400 Subject: [PATCH 020/126] cleanups --- pandas/core/arrays/masked.py | 10 +++++----- 1 file changed, 5 
insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index e14538a210c07..93ff9216c57b8 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -329,7 +329,7 @@ def __contains__(self, key) -> bool: # GH#52840 if self._data.dtype.kind == "f" and lib.is_float(key): # TODO: implement low level invert operator on BitMaskArray - return bool((np.isnan(self._data) & ~self._mask.to_numpy()).any()) + return bool((np.isnan(self._data) & ~self._mask).any()) return bool(super().__contains__(key)) @@ -789,7 +789,7 @@ def _arith_method(self, other, op): if op_name == "pow": # 1 ** x is 1. - mask = np.where((self._data == 1) & ~self._mask.to_numpy(), False, mask) + mask = np.where((self._data == 1) & ~self._mask, False, mask) # x ** 0 is 1. if omask is not None: mask = np.where((other == 0) & ~omask, False, mask) @@ -803,7 +803,7 @@ def _arith_method(self, other, op): elif other is not libmissing.NA: mask = np.where(other == 1, False, mask) # x ** 0 is 1. 
- mask = np.where((self._data == 0) & ~self._mask.to_numpy(), False, mask) + mask = np.where((self._data == 0) & ~self._mask, False, mask) return self._maybe_mask_result(result, mask) @@ -1111,8 +1111,8 @@ def equals(self, other) -> bool: if not np.array_equal(self._mask.to_numpy(), other._mask.to_numpy()): return False - left = self._data[~self._mask.to_numpy()] - right = other._data[~other._mask.to_numpy()] + left = self._data[~self._mask] + right = other._data[~other._mask] return array_equivalent(left, right, strict_nan=True, dtype_equal=True) def _quantile( From 6fbbad897bb147641445c52c1e0596ebecb58c51 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 13 Aug 2023 10:18:37 -0400 Subject: [PATCH 021/126] fix --- pandas/core/arrays/masked.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 93ff9216c57b8..04952f8d8ff3c 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -322,6 +322,9 @@ def __setitem__(self, key, value) -> None: value, mask = self._coerce_to_array(value, dtype=self.dtype) self._data[key] = value + if isinstance(mask, BitMaskArray): + mask = mask.to_numpy() + self._mask[key] = mask def __contains__(self, key) -> bool: From b9723ab44c1806eb61d0845ff992b32f7d06f751 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 13 Aug 2023 23:11:13 -0400 Subject: [PATCH 022/126] cleanups and some performance boosts --- .pre-commit-config.yaml | 3 +++ pandas/_libs/arrays.pyx | 26 ++++++++++++++++---------- pandas/core/arrays/numeric.py | 3 +-- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 000949c41f5a0..bc6fb0e3bf99e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -46,6 +46,9 @@ repos: - id: codespell types_or: [python, rst, markdown, cython, c] additional_dependencies: [tomli] + exclude: | + ^pandas/_libs/include/pandas/vendored/nanoarrow.h + 
|pandas/_libs/src/vendored/nanoarrow.c - repo: https://github.com/MarcoGorelli/cython-lint rev: v0.15.0 hooks: diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 4699e44d569ee..c2b0b724b34cd 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -213,22 +213,25 @@ def _unpickle_bitmaskarray(array): cdef class BitMaskArray: - cdef public: + cdef: int array_size - int nbytes object array_shape - object parent uint8_t* validity_buffer + cdef public: + int nbytes + object parent def __cinit__(self, data): self.parent = None + cdef int index = 0 if isinstance(data, np.ndarray): self.array_size = data.size self.array_shape = data.shape self.nbytes = self.array_size // 8 + 1 self.validity_buffer = malloc(self.nbytes) - for index, value in enumerate(data.flatten()): + for value in data.flatten(): self[index] = value + index += 1 elif isinstance(data, type(self)): self.parent = data # other attributes are undefined when a parent exists @@ -240,8 +243,9 @@ cdef class BitMaskArray: free(self.validity_buffer) def __setitem__(self, key, value): + cdef int index = 0 if self.parent is not None: - self.parent.__setitem__(key, value) + self.parent[key] = value elif isinstance(key, int) and key >= 0: if value: ArrowBitSet(self.validity_buffer, key) @@ -250,15 +254,16 @@ cdef class BitMaskArray: else: arr = self.to_numpy() arr[key] = value - for index, val in enumerate(arr.flatten()): + for val in arr.flatten(): if val: ArrowBitSet(self.validity_buffer, index) else: ArrowBitClear(self.validity_buffer, index) + index += 1 def __getitem__(self, key): if self.parent is not None: - return self.parent.__getitem__(key) + return self.parent[key] elif isinstance(key, int) and key >= 0: return bool(ArrowBitGet(self.validity_buffer, key)) else: @@ -271,7 +276,7 @@ cdef class BitMaskArray: def __or__(self, other): if self.parent is not None: - return self.parent.__or__(other) + return self.parent | other elif isinstance(other, type(self)): return 
self.to_numpy() | other.to_numpy() else: @@ -284,8 +289,9 @@ cdef class BitMaskArray: def to_numpy(self) -> ndarray: if self.parent is not None: return self.parent.to_numpy() - cdef ndarray[uint8_t] result - result = np.empty(self.array_size, dtype=bool) + + cdef int i + cdef ndarray[uint8_t] result = np.empty(self.array_size, dtype=bool) for i in range(self.array_size): result[i] = self[i] diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 76903074c763d..897ae8a89c73c 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -32,7 +32,6 @@ import pyarrow - from pandas._libs.arrays import BitMaskArray from pandas._typing import ( Dtype, DtypeObj, @@ -233,7 +232,7 @@ class NumericArray(BaseMaskedArray): def __init__( self, values: np.ndarray, - mask: npt.NDArray[np.bool_] | BitMaskArray, + mask: npt.NDArray[np.bool_], copy: bool = False, ) -> None: checker = self._dtype_cls._checker From 3f60cd072fbdc19c5ea240d93306ca93dc123be7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 14 Aug 2023 08:33:38 -0400 Subject: [PATCH 023/126] perf boost --- pandas/_libs/arrays.pyx | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index c2b0b724b34cd..6e67f15616926 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -214,24 +214,33 @@ def _unpickle_bitmaskarray(array): cdef class BitMaskArray: cdef: - int array_size + Py_ssize_t array_size + Py_ssize_t array_nbytes object array_shape uint8_t* validity_buffer cdef public: - int nbytes object parent + cdef _setitem_with_integral(self, const int key, const uint8_t value): + if value: + ArrowBitSet(self.validity_buffer, key) + else: + ArrowBitClear(self.validity_buffer, key) + + cdef void init_from_ndarray(self, const uint8_t[:] arr): + cdef Py_ssize_t i, arrlen + self.array_size = arr.size + self.array_shape = arr.shape + self.array_nbytes = self.array_size 
// 8 + 1 + self.validity_buffer = malloc(self.array_nbytes) + arrlen = len(arr) + for i in range(arrlen): + self._setitem_with_integral(i, arr[i]) + def __cinit__(self, data): self.parent = None - cdef int index = 0 if isinstance(data, np.ndarray): - self.array_size = data.size - self.array_shape = data.shape - self.nbytes = self.array_size // 8 + 1 - self.validity_buffer = malloc(self.nbytes) - for value in data.flatten(): - self[index] = value - index += 1 + self.init_from_ndarray(data.flatten()) elif isinstance(data, type(self)): self.parent = data # other attributes are undefined when a parent exists @@ -247,10 +256,7 @@ cdef class BitMaskArray: if self.parent is not None: self.parent[key] = value elif isinstance(key, int) and key >= 0: - if value: - ArrowBitSet(self.validity_buffer, key) - else: - ArrowBitClear(self.validity_buffer, key) + self._setitem_with_integral(key, bool(value)) else: arr = self.to_numpy() arr[key] = value @@ -286,6 +292,10 @@ cdef class BitMaskArray: object_state = (self.to_numpy(),) return (_unpickle_bitmaskarray, object_state) + @property + def nbytes(self) -> int: + return self.array_nbytes + def to_numpy(self) -> ndarray: if self.parent is not None: return self.parent.to_numpy() From 3b8921a76e19553ab88db8943c9a267bf5678a37 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 14 Aug 2023 08:43:23 -0400 Subject: [PATCH 024/126] perf boost --- pandas/_libs/arrays.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 6e67f15616926..8f2e3bcbdeccb 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -230,7 +230,6 @@ cdef class BitMaskArray: cdef void init_from_ndarray(self, const uint8_t[:] arr): cdef Py_ssize_t i, arrlen self.array_size = arr.size - self.array_shape = arr.shape self.array_nbytes = self.array_size // 8 + 1 self.validity_buffer = malloc(self.array_nbytes) arrlen = len(arr) @@ -240,7 +239,8 @@ cdef class BitMaskArray: def 
__cinit__(self, data): self.parent = None if isinstance(data, np.ndarray): - self.init_from_ndarray(data.flatten()) + self.array_shape = data.shape + self.init_from_ndarray(data.ravel()) elif isinstance(data, type(self)): self.parent = data # other attributes are undefined when a parent exists @@ -260,7 +260,7 @@ cdef class BitMaskArray: else: arr = self.to_numpy() arr[key] = value - for val in arr.flatten(): + for val in arr.ravel(): if val: ArrowBitSet(self.validity_buffer, index) else: From 999e7436c85a614b8e00f13115383b6c779b12f6 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 14 Aug 2023 09:30:27 -0400 Subject: [PATCH 025/126] more performance --- pandas/_libs/arrays.pyx | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 8f2e3bcbdeccb..e3c384449a6bc 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -221,20 +221,24 @@ cdef class BitMaskArray: cdef public: object parent - cdef _setitem_with_integral(self, const int key, const uint8_t value): + cdef void _setitem_integral(self, const int key, const uint8_t value): if value: ArrowBitSet(self.validity_buffer, key) else: ArrowBitClear(self.validity_buffer, key) + cdef uint8_t _getitem_integral(self, const Py_ssize_t index): + return ArrowBitGet(self.validity_buffer, index) + + @cython.boundscheck(False) + @cython.wraparound(False) cdef void init_from_ndarray(self, const uint8_t[:] arr): - cdef Py_ssize_t i, arrlen - self.array_size = arr.size + cdef Py_ssize_t i + self.array_size = arr.shape[0] self.array_nbytes = self.array_size // 8 + 1 self.validity_buffer = malloc(self.array_nbytes) - arrlen = len(arr) - for i in range(arrlen): - self._setitem_with_integral(i, arr[i]) + for i in range(self.array_size): + self._setitem_integral(i, arr[i]) def __cinit__(self, data): self.parent = None @@ -252,26 +256,28 @@ cdef class BitMaskArray: free(self.validity_buffer) def __setitem__(self, 
key, value): - cdef int index = 0 + cdef const uint8_t[:] arr1d + cdef Py_ssize_t i = 0 + if self.parent is not None: self.parent[key] = value elif isinstance(key, int) and key >= 0: - self._setitem_with_integral(key, bool(value)) + self._setitem_integral(key, bool(value)) else: arr = self.to_numpy() arr[key] = value - for val in arr.ravel(): - if val: - ArrowBitSet(self.validity_buffer, index) + arr1d = arr.ravel() + for i in range(arr1d.shape[0]): + if arr1d[i]: + ArrowBitSet(self.validity_buffer, i) else: - ArrowBitClear(self.validity_buffer, index) - index += 1 + ArrowBitClear(self.validity_buffer, i) def __getitem__(self, key): if self.parent is not None: return self.parent[key] elif isinstance(key, int) and key >= 0: - return bool(ArrowBitGet(self.validity_buffer, key)) + return self._getitem_integral(key) else: return self.to_numpy()[key] @@ -300,9 +306,9 @@ cdef class BitMaskArray: if self.parent is not None: return self.parent.to_numpy() - cdef int i + cdef Py_ssize_t i cdef ndarray[uint8_t] result = np.empty(self.array_size, dtype=bool) for i in range(self.array_size): - result[i] = self[i] + result[i] = self._getitem_integral(i) return result.reshape(self.array_shape) From 2b764ce38cabe844622ed763e2095c16da8941bd Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 14 Aug 2023 11:06:32 -0400 Subject: [PATCH 026/126] better perf --- pandas/_libs/arrays.pyx | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index e3c384449a6bc..251e127ebef6a 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -24,8 +24,7 @@ from libc.stdlib cimport ( cdef extern from "pandas/vendored/nanoarrow.h": int8_t ArrowBitGet(const uint8_t*, int64_t) - void ArrowBitSet(uint8_t*, int64_t) - void ArrowBitClear(uint8_t*, int64_t) + void ArrowBitSetTo(uint8_t*, int64_t, uint8_t) @cython.freelist(16) @@ -221,15 +220,6 @@ cdef class BitMaskArray: cdef public: object parent - cdef 
void _setitem_integral(self, const int key, const uint8_t value): - if value: - ArrowBitSet(self.validity_buffer, key) - else: - ArrowBitClear(self.validity_buffer, key) - - cdef uint8_t _getitem_integral(self, const Py_ssize_t index): - return ArrowBitGet(self.validity_buffer, index) - @cython.boundscheck(False) @cython.wraparound(False) cdef void init_from_ndarray(self, const uint8_t[:] arr): @@ -238,7 +228,7 @@ cdef class BitMaskArray: self.array_nbytes = self.array_size // 8 + 1 self.validity_buffer = malloc(self.array_nbytes) for i in range(self.array_size): - self._setitem_integral(i, arr[i]) + ArrowBitSetTo(self.validity_buffer, i, arr[i]) def __cinit__(self, data): self.parent = None @@ -262,22 +252,19 @@ cdef class BitMaskArray: if self.parent is not None: self.parent[key] = value elif isinstance(key, int) and key >= 0: - self._setitem_integral(key, bool(value)) + ArrowBitSetTo(self.validity_buffer, key, bool(value)) else: arr = self.to_numpy() arr[key] = value arr1d = arr.ravel() for i in range(arr1d.shape[0]): - if arr1d[i]: - ArrowBitSet(self.validity_buffer, i) - else: - ArrowBitClear(self.validity_buffer, i) + ArrowBitSetTo(self.validity_buffer, i, arr1d[i]) def __getitem__(self, key): if self.parent is not None: return self.parent[key] elif isinstance(key, int) and key >= 0: - return self._getitem_integral(key) + return ArrowBitGet(self.validity_buffer, key) else: return self.to_numpy()[key] @@ -309,6 +296,6 @@ cdef class BitMaskArray: cdef Py_ssize_t i cdef ndarray[uint8_t] result = np.empty(self.array_size, dtype=bool) for i in range(self.array_size): - result[i] = self._getitem_integral(i) + result[i] = ArrowBitGet(self.validity_buffer, i) return result.reshape(self.array_shape) From 74548e857958300ab979f6bb8c69640d35ef9798 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 14 Aug 2023 12:03:38 -0400 Subject: [PATCH 027/126] code and typing cleanups --- pandas/_libs/arrays.pyi | 22 +++++++++++++++++++++- pandas/_libs/arrays.pyx | 15 
++++++++++----- pandas/core/arrays/masked.py | 10 +++++----- 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 86f69c3cdfc75..632e08002e9bd 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -1,10 +1,16 @@ -from typing import Sequence +from typing import ( + Callable, + Sequence, + Tuple, +) import numpy as np from pandas._typing import ( + ArrayLike, AxisInt, DtypeObj, + PositionalIndexer, Self, Shape, ) @@ -38,3 +44,17 @@ class NDArrayBacked: def _concat_same_type( cls, to_concat: Sequence[Self], axis: AxisInt = ... ) -> Self: ... + +class BitMaskArray: + parent: Self + def __cinit__(self, data: np.ndarray | Self) -> None: ... + def __init__(self, data: np.ndarray | Self) -> None: ... + def __dealloc__(self) -> None: ... + def __setitem__(self, key: PositionalIndexer, value: ArrayLike | bool) -> None: ... + def __getitem__(self, key: PositionalIndexer) -> bool: ... + def __invert__(self) -> np.ndarray: ... + def __or__(self, other: np.ndarray | Self) -> np.ndarray: ... + def __reduce__(self) -> Tuple[Callable[[np.ndarray], Self], Tuple[np.ndarray]]: ... + @property + def nbytes(self) -> int: ... + def to_numpy(self) -> np.ndarray: ... 
diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 251e127ebef6a..050360805a441 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -251,7 +251,7 @@ cdef class BitMaskArray: if self.parent is not None: self.parent[key] = value - elif isinstance(key, int) and key >= 0: + elif isinstance(key, int) and key >= 0 and key < self.array_size: ArrowBitSetTo(self.validity_buffer, key, bool(value)) else: arr = self.to_numpy() @@ -263,7 +263,7 @@ cdef class BitMaskArray: def __getitem__(self, key): if self.parent is not None: return self.parent[key] - elif isinstance(key, int) and key >= 0: + elif isinstance(key, int) and key >= 0 and key < self.array_size: return ArrowBitGet(self.validity_buffer, key) else: return self.to_numpy()[key] @@ -289,13 +289,18 @@ cdef class BitMaskArray: def nbytes(self) -> int: return self.array_nbytes + @cython.boundscheck(False) + @cython.wraparound(False) + cdef void convert_to_boolean_array(self, uint8_t[:] out): + cdef Py_ssize_t i + for i in range(self.array_size): + out[i] = ArrowBitGet(self.validity_buffer, i) + def to_numpy(self) -> ndarray: if self.parent is not None: return self.parent.to_numpy() - cdef Py_ssize_t i cdef ndarray[uint8_t] result = np.empty(self.array_size, dtype=bool) - for i in range(self.array_size): - result[i] = ArrowBitGet(self.validity_buffer, i) + self.convert_to_boolean_array(result) return result.reshape(self.array_shape) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 04952f8d8ff3c..f1da44ef4e3da 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -567,7 +567,7 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: # not directly depending on the `copy` keyword mask = self._mask if data is self._data else self._mask.to_numpy().copy() cls = dtype.construct_array_type() - return cls(data, mask, copy=False) + return cls(data, mask, copy=False) # type: ignore[arg-type] if isinstance(dtype, ExtensionDtype): 
eacls = dtype.construct_array_type() @@ -1395,8 +1395,8 @@ def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): # bool, int, float, complex, str, bytes, # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" np.putmask( - values, self._mask.to_numpy(), self._falsey_value - ) # type: ignore[arg-type] + values, self._mask.to_numpy(), self._falsey_value # type: ignore[arg-type] + ) result = values.any() if skipna: return result @@ -1478,8 +1478,8 @@ def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): # bool, int, float, complex, str, bytes, # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" np.putmask( - values, self._mask.to_numpy(), self._truthy_value - ) # type: ignore[arg-type] + values, self._mask.to_numpy(), self._truthy_value # type: ignore[arg-type] + ) result = values.all(axis=axis) if skipna: From dce8002fc76d0ff902a8a80064b59d3dd00477c4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 14 Aug 2023 15:41:36 -0400 Subject: [PATCH 028/126] refactor and lower level invert/or implementation --- pandas/_libs/arrays.pyx | 112 +++++++++++++++++++++++------------ pandas/core/arrays/masked.py | 2 +- 2 files changed, 76 insertions(+), 38 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 050360805a441..0da5d775ed948 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -211,14 +211,27 @@ def _unpickle_bitmaskarray(array): return bma +cdef void buf_invert(uint8_t* dest, uint8_t* src, Py_ssize_t size): + cdef Py_ssize_t i + for i in range(size): + dest[i] = ~src[i] + + +cdef void buf_or(uint8_t* dest, uint8_t* src1, uint8_t* src2, Py_ssize_t size): + cdef Py_ssize_t i + for i in range(size): + dest[i] = src1[i] | src2[i] + + cdef class BitMaskArray: cdef: Py_ssize_t array_size Py_ssize_t array_nbytes - object array_shape uint8_t* validity_buffer + bint buffer_owner # set when parent is None, but gives C-level access cdef public: - object parent + object 
array_shape + object parent # assignments gives RC to ensure proper buffer lifecycle @cython.boundscheck(False) @cython.wraparound(False) @@ -227,57 +240,84 @@ cdef class BitMaskArray: self.array_size = arr.shape[0] self.array_nbytes = self.array_size // 8 + 1 self.validity_buffer = malloc(self.array_nbytes) + self.buffer_owner = True for i in range(self.array_size): ArrowBitSetTo(self.validity_buffer, i, arr[i]) + cdef void init_from_bitmaskarray(self, BitMaskArray bma): + self.buffer_owner = False + self.array_size = bma.array_size + self.array_nbytes = bma.array_nbytes + self.validity_buffer = bma.validity_buffer + def __cinit__(self, data): - self.parent = None if isinstance(data, np.ndarray): - self.array_shape = data.shape self.init_from_ndarray(data.ravel()) + self.array_shape = data.shape + self.parent = None elif isinstance(data, type(self)): + self.init_from_bitmaskarray(data) + self.array_shape = data.array_shape self.parent = data - # other attributes are undefined when a parent exists else: raise TypeError("Unsupported argument to BitMaskArray constructor") def __dealloc__(self): - if not self.parent: + if self.buffer_owner: free(self.validity_buffer) def __setitem__(self, key, value): cdef const uint8_t[:] arr1d cdef Py_ssize_t i = 0 - - if self.parent is not None: - self.parent[key] = value - elif isinstance(key, int) and key >= 0 and key < self.array_size: - ArrowBitSetTo(self.validity_buffer, key, bool(value)) - else: - arr = self.to_numpy() - arr[key] = value - arr1d = arr.ravel() - for i in range(arr1d.shape[0]): - ArrowBitSetTo(self.validity_buffer, i, arr1d[i]) + cdef Py_ssize_t ckey + cdef bint cvalue + + if isinstance(key, int): + ckey = key + cvalue = value + if ckey >= 0 and ckey < self.array_size: + ArrowBitSetTo(self.validity_buffer, ckey, cvalue) + return + + arr = self.to_numpy() + arr[key] = value + arr1d = arr.ravel() + for i in range(arr1d.shape[0]): + ArrowBitSetTo(self.validity_buffer, i, arr1d[i]) def __getitem__(self, key): - if 
self.parent is not None: - return self.parent[key] - elif isinstance(key, int) and key >= 0 and key < self.array_size: - return ArrowBitGet(self.validity_buffer, key) - else: - return self.to_numpy()[key] + cdef Py_ssize_t ckey + if isinstance(key, int): + ckey = key + if ckey >= 0 and ckey < self.array_size: + return ArrowBitGet(self.validity_buffer, ckey) + + return self.to_numpy()[key] def __invert__(self): - if self.parent is not None: - return ~self.parent - return ~self.to_numpy() + cdef ndarray[uint8_t] result + result = np.empty(self.array_size, dtype=bool) + + cdef uint8_t* inverted = malloc(self.array_size) + buf_invert(inverted, self.validity_buffer, self.array_size) + BitMaskArray.buffer_to_array_1d(result, inverted, self.array_size) + free(inverted) + return result.reshape(self.array_shape) def __or__(self, other): - if self.parent is not None: - return self.parent | other - elif isinstance(other, type(self)): - return self.to_numpy() | other.to_numpy() + cdef ndarray[uint8_t] result + cdef uint8_t* ored + cdef BitMaskArray other_buf + if isinstance(other, type(self)): + other_buf = other + result = np.empty(self.array_size, dtype=bool) + ored = malloc(self.array_size) + buf_or( + ored, self.validity_buffer, other_buf.validity_buffer, self.array_size + ) + BitMaskArray.buffer_to_array_1d(result, ored, self.array_size) + free(ored) + return result.reshape(self.array_shape) else: return self.to_numpy() | other @@ -291,16 +331,14 @@ cdef class BitMaskArray: @cython.boundscheck(False) @cython.wraparound(False) - cdef void convert_to_boolean_array(self, uint8_t[:] out): + @staticmethod + cdef void buffer_to_array_1d(uint8_t[:] out, const uint8_t* buf, Py_ssize_t size): cdef Py_ssize_t i - for i in range(self.array_size): - out[i] = ArrowBitGet(self.validity_buffer, i) + for i in range(size): + out[i] = ArrowBitGet(buf, i) def to_numpy(self) -> ndarray: - if self.parent is not None: - return self.parent.to_numpy() - cdef ndarray[uint8_t] result = 
np.empty(self.array_size, dtype=bool) - self.convert_to_boolean_array(result) + BitMaskArray.buffer_to_array_1d(result, self.validity_buffer, self.array_size) return result.reshape(self.array_shape) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index f1da44ef4e3da..14a64bd021465 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -700,7 +700,7 @@ def _hasna(self) -> bool: return self._mask.to_numpy().any() # type: ignore[return-value] def _propagate_mask( - self, mask: npt.NDArray[np.bool_] | None, other + self, mask: npt.NDArray[np.bool_] | BitMaskArray | None, other ) -> npt.NDArray[np.bool_]: if mask is None: mask = ( From e641fedaf6f85d301ae64ec886d1979e048c678c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 14 Aug 2023 17:05:12 -0400 Subject: [PATCH 029/126] Mass append nanoarrow for buffer performance --- pandas/_libs/arrays.pyx | 48 +++++++++++++++++++-------- pandas/_libs/src/vendored/nanoarrow.c | 8 ++--- 2 files changed, 38 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 0da5d775ed948..005b21ceed520 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -23,6 +23,18 @@ from libc.stdlib cimport ( cdef extern from "pandas/vendored/nanoarrow.h": + struct ArrowBuffer: + uint8_t* data + int64_t size_bytes + + struct ArrowBitmap: + ArrowBuffer buffer + int64_t size_bits + + void ArrowBitmapInit(ArrowBitmap*) + void ArrowBitmapReserve(ArrowBitmap*, int64_t) + void ArrowBitmapAppendInt8Unsafe(ArrowBitmap*, const int8_t *, int64_t) + void ArrowBitmapReset(ArrowBitmap*) int8_t ArrowBitGet(const uint8_t*, int64_t) void ArrowBitSetTo(uint8_t*, int64_t, uint8_t) @@ -227,7 +239,7 @@ cdef class BitMaskArray: cdef: Py_ssize_t array_size Py_ssize_t array_nbytes - uint8_t* validity_buffer + ArrowBitmap bitmap bint buffer_owner # set when parent is None, but gives C-level access cdef public: object array_shape @@ -235,20 +247,21 @@ cdef class 
BitMaskArray: @cython.boundscheck(False) @cython.wraparound(False) - cdef void init_from_ndarray(self, const uint8_t[:] arr): - cdef Py_ssize_t i + cdef void init_from_ndarray(self, const uint8_t[::1] arr): + cdef ArrowBitmap bitmap self.array_size = arr.shape[0] self.array_nbytes = self.array_size // 8 + 1 - self.validity_buffer = malloc(self.array_nbytes) + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, self.array_size) + ArrowBitmapAppendInt8Unsafe(&bitmap, &arr[0], self.array_size) + self.bitmap = bitmap self.buffer_owner = True - for i in range(self.array_size): - ArrowBitSetTo(self.validity_buffer, i, arr[i]) cdef void init_from_bitmaskarray(self, BitMaskArray bma): self.buffer_owner = False self.array_size = bma.array_size self.array_nbytes = bma.array_nbytes - self.validity_buffer = bma.validity_buffer + self.bitmap = bma.bitmap def __cinit__(self, data): if isinstance(data, np.ndarray): @@ -264,7 +277,7 @@ cdef class BitMaskArray: def __dealloc__(self): if self.buffer_owner: - free(self.validity_buffer) + ArrowBitmapReset(&self.bitmap) def __setitem__(self, key, value): cdef const uint8_t[:] arr1d @@ -276,21 +289,21 @@ cdef class BitMaskArray: ckey = key cvalue = value if ckey >= 0 and ckey < self.array_size: - ArrowBitSetTo(self.validity_buffer, ckey, cvalue) + ArrowBitSetTo(self.bitmap.buffer.data, ckey, cvalue) return arr = self.to_numpy() arr[key] = value arr1d = arr.ravel() for i in range(arr1d.shape[0]): - ArrowBitSetTo(self.validity_buffer, i, arr1d[i]) + ArrowBitSetTo(self.bitmap.buffer.data, i, arr1d[i]) def __getitem__(self, key): cdef Py_ssize_t ckey if isinstance(key, int): ckey = key if ckey >= 0 and ckey < self.array_size: - return ArrowBitGet(self.validity_buffer, ckey) + return ArrowBitGet(self.bitmap.buffer.data, ckey) return self.to_numpy()[key] @@ -299,7 +312,7 @@ cdef class BitMaskArray: result = np.empty(self.array_size, dtype=bool) cdef uint8_t* inverted = malloc(self.array_size) - buf_invert(inverted, self.validity_buffer, 
self.array_size) + buf_invert(inverted, self.bitmap.buffer.data, self.array_size) BitMaskArray.buffer_to_array_1d(result, inverted, self.array_size) free(inverted) return result.reshape(self.array_shape) @@ -313,7 +326,10 @@ cdef class BitMaskArray: result = np.empty(self.array_size, dtype=bool) ored = malloc(self.array_size) buf_or( - ored, self.validity_buffer, other_buf.validity_buffer, self.array_size + ored, + self.bitmap.buffer.data, + other_buf.bitmap.buffer.data, + self.array_size ) BitMaskArray.buffer_to_array_1d(result, ored, self.array_size) free(ored) @@ -339,6 +355,10 @@ cdef class BitMaskArray: def to_numpy(self) -> ndarray: cdef ndarray[uint8_t] result = np.empty(self.array_size, dtype=bool) - BitMaskArray.buffer_to_array_1d(result, self.validity_buffer, self.array_size) + BitMaskArray.buffer_to_array_1d( + result, + self.bitmap.buffer.data, + self.array_size + ) return result.reshape(self.array_shape) diff --git a/pandas/_libs/src/vendored/nanoarrow.c b/pandas/_libs/src/vendored/nanoarrow.c index 7cc53b43550d7..fc23c71992c4b 100644 --- a/pandas/_libs/src/vendored/nanoarrow.c +++ b/pandas/_libs/src/vendored/nanoarrow.c @@ -22,7 +22,7 @@ #include #include -#include "pandas/vendored/nanoarrow.h" +#include "nanoarrow.h" const char* ArrowNanoarrowVersion(void) { return NANOARROW_VERSION; } @@ -253,7 +253,7 @@ struct ArrowBufferAllocator ArrowBufferDeallocator( #include #include -#include "pandas/vendored/nanoarrow.h" +#include "nanoarrow.h" static void ArrowSchemaRelease(struct ArrowSchema* schema) { if (schema->format != NULL) ArrowFree((void*)schema->format); @@ -1444,7 +1444,7 @@ static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_vi } } -// Helper for bookeeping to emulate sprintf()-like behaviour spread +// Helper for bookkeeping to emulate sprintf()-like behaviour spread // among multiple sprintf calls. 
static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last, int64_t* n_remaining, int64_t* n_chars) { @@ -1777,7 +1777,7 @@ ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, #include #include -#include "pandas/vendored/nanoarrow.h" +#include "nanoarrow.h" static void ArrowArrayRelease(struct ArrowArray* array) { // Release buffers held by this array From 109dd57c9d3abe1bd8125197a49ab9a6db812cb1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 14 Aug 2023 17:19:16 -0400 Subject: [PATCH 030/126] delete duplicative struct members --- pandas/_libs/arrays.pyx | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 005b21ceed520..68148671a4ee7 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -237,8 +237,6 @@ cdef void buf_or(uint8_t* dest, uint8_t* src1, uint8_t* src2, Py_ssize_t size): cdef class BitMaskArray: cdef: - Py_ssize_t array_size - Py_ssize_t array_nbytes ArrowBitmap bitmap bint buffer_owner # set when parent is None, but gives C-level access cdef public: @@ -249,18 +247,17 @@ cdef class BitMaskArray: @cython.wraparound(False) cdef void init_from_ndarray(self, const uint8_t[::1] arr): cdef ArrowBitmap bitmap - self.array_size = arr.shape[0] - self.array_nbytes = self.array_size // 8 + 1 + # As long as we have a 1D arr argument we can use .shape[0] to avoid + # a call to Python via .size + cdef int64_t nobs = arr.shape[0] ArrowBitmapInit(&bitmap) - ArrowBitmapReserve(&bitmap, self.array_size) - ArrowBitmapAppendInt8Unsafe(&bitmap, &arr[0], self.array_size) - self.bitmap = bitmap + ArrowBitmapReserve(&bitmap, nobs) + ArrowBitmapAppendInt8Unsafe(&bitmap, &arr[0], nobs) self.buffer_owner = True + self.bitmap = bitmap cdef void init_from_bitmaskarray(self, BitMaskArray bma): self.buffer_owner = False - self.array_size = bma.array_size - self.array_nbytes = bma.array_nbytes self.bitmap = bma.bitmap 
def __cinit__(self, data): @@ -288,7 +285,7 @@ cdef class BitMaskArray: if isinstance(key, int): ckey = key cvalue = value - if ckey >= 0 and ckey < self.array_size: + if ckey >= 0 and ckey < self.bitmap.size_bits: ArrowBitSetTo(self.bitmap.buffer.data, ckey, cvalue) return @@ -302,18 +299,18 @@ cdef class BitMaskArray: cdef Py_ssize_t ckey if isinstance(key, int): ckey = key - if ckey >= 0 and ckey < self.array_size: + if ckey >= 0 and ckey < self.bitmap.size_bits: return ArrowBitGet(self.bitmap.buffer.data, ckey) return self.to_numpy()[key] def __invert__(self): cdef ndarray[uint8_t] result - result = np.empty(self.array_size, dtype=bool) + result = np.empty(self.bitmap.size_bits, dtype=bool) - cdef uint8_t* inverted = malloc(self.array_size) - buf_invert(inverted, self.bitmap.buffer.data, self.array_size) - BitMaskArray.buffer_to_array_1d(result, inverted, self.array_size) + cdef uint8_t* inverted = malloc(self.bitmap.size_bits) + buf_invert(inverted, self.bitmap.buffer.data, self.bitmap.size_bits) + BitMaskArray.buffer_to_array_1d(result, inverted, self.bitmap.size_bits) free(inverted) return result.reshape(self.array_shape) @@ -323,15 +320,15 @@ cdef class BitMaskArray: cdef BitMaskArray other_buf if isinstance(other, type(self)): other_buf = other - result = np.empty(self.array_size, dtype=bool) - ored = malloc(self.array_size) + result = np.empty(self.bitmap.size_bits, dtype=bool) + ored = malloc(self.bitmap.size_bits) buf_or( ored, self.bitmap.buffer.data, other_buf.bitmap.buffer.data, - self.array_size + self.bitmap.size_bits ) - BitMaskArray.buffer_to_array_1d(result, ored, self.array_size) + BitMaskArray.buffer_to_array_1d(result, ored, self.bitmap.size_bits) free(ored) return result.reshape(self.array_shape) else: @@ -343,7 +340,7 @@ cdef class BitMaskArray: @property def nbytes(self) -> int: - return self.array_nbytes + return self.bitmap.buffer.size_bytes @cython.boundscheck(False) @cython.wraparound(False) @@ -354,11 +351,11 @@ cdef class 
BitMaskArray: out[i] = ArrowBitGet(buf, i) def to_numpy(self) -> ndarray: - cdef ndarray[uint8_t] result = np.empty(self.array_size, dtype=bool) + cdef ndarray[uint8_t] result = np.empty(self.bitmap.size_bits, dtype=bool) BitMaskArray.buffer_to_array_1d( result, self.bitmap.buffer.data, - self.array_size + self.bitmap.size_bits ) return result.reshape(self.array_shape) From 35f3b9c6cbc6ac8f17d15e3a3b7134741fd21caf Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 14 Aug 2023 17:25:59 -0400 Subject: [PATCH 031/126] fix pickling --- pandas/_libs/arrays.pyx | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 68148671a4ee7..285af8de74aad 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -218,8 +218,8 @@ cdef class NDArrayBacked: return to_concat[0]._from_backing_data(new_arr) -def _unpickle_bitmaskarray(array): - bma = BitMaskArray(array) +def _unpickle_bitmaskarray(array, parent): + bma = BitMaskArray(array, parent) return bma @@ -260,15 +260,23 @@ cdef class BitMaskArray: self.buffer_owner = False self.bitmap = bma.bitmap - def __cinit__(self, data): + def __cinit__(self, data, parent=None): + # parent is only required to reconstruct ref-counting from pickle + # but should not be called from user code if isinstance(data, np.ndarray): self.init_from_ndarray(data.ravel()) self.array_shape = data.shape - self.parent = None + if parent: + self.parent = parent + else: + self.parent = None elif isinstance(data, type(self)): self.init_from_bitmaskarray(data) self.array_shape = data.array_shape - self.parent = data + if parent: + self.parent = parent + else: + self.parent = data else: raise TypeError("Unsupported argument to BitMaskArray constructor") @@ -335,8 +343,8 @@ cdef class BitMaskArray: return self.to_numpy() | other def __reduce__(self): - object_state = (self.to_numpy(),) - return (_unpickle_bitmaskarray, object_state) + object_state = 
(self.to_numpy(), self.parent) + return (_unpickle_bitmaskarray, object_state, self.parent) @property def nbytes(self) -> int: From 25e3c51c6a75b6b222425d7cd017f908d020e122 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 14 Aug 2023 17:27:48 -0400 Subject: [PATCH 032/126] nanoarrow typo fixups --- pandas/_libs/include/pandas/vendored/nanoarrow.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/include/pandas/vendored/nanoarrow.h b/pandas/_libs/include/pandas/vendored/nanoarrow.h index 666dea1448326..84440dcbd423f 100644 --- a/pandas/_libs/include/pandas/vendored/nanoarrow.h +++ b/pandas/_libs/include/pandas/vendored/nanoarrow.h @@ -264,7 +264,7 @@ typedef int ArrowErrorCode; /// \ingroup nanoarrow-errors /// /// If nanoarrow was built in debug mode (i.e., defined(NANOARROW_DEBUG) is true), -/// print a message to stderr and abort. If nanoarrow was bulit in release mode, +/// print a message to stderr and abort. If nanoarrow was built in release mode, /// this statement has no effect. You can customize fatal error behaviour /// be defining the NANOARROW_PRINT_AND_DIE macro before including nanoarrow.h /// This macro is provided as a convenience for users and is not used internally. @@ -1252,7 +1252,7 @@ ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, /// Contains more readily extractable values than a raw ArrowSchema. /// Clients can stack or statically allocate this structure but are /// encouraged to use the provided getters to ensure forward -/// compatiblity. +/// compatibility. struct ArrowSchemaView { /// \brief A pointer to the schema represented by this view struct ArrowSchema* schema; @@ -1725,7 +1725,7 @@ ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, /// (i.e. 
NANOARROW_VALIDATION_LEVEL_NONE or NANOARROW_VALIDATION_LEVEL_MINIMAL) if CPU /// buffer data access is not possible or more validation (i.e., /// NANOARROW_VALIDATION_LEVEL_FULL) if buffer content was obtained from an untrusted or -/// corruptable source. +/// corruptible source. ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, enum ArrowValidationLevel validation_level, struct ArrowError* error); @@ -3059,7 +3059,7 @@ static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* arr switch (private_data->storage_type) { case NANOARROW_TYPE_DENSE_UNION: - // Apppend the target child length to the union offsets buffer + // Append the target child length to the union offsets buffer _NANOARROW_CHECK_RANGE(array->children[child_index]->length, 0, INT32_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( ArrowArrayBuffer(array, 1), (int32_t)array->children[child_index]->length - 1)); From 82e082e714dfcfc5fa549d739deade2dfef98b15 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 17 Aug 2023 12:41:35 -0400 Subject: [PATCH 033/126] vectorized to_numpy() --- pandas/_libs/arrays.pyx | 13 +++-- .../_libs/include/pandas/vendored/nanoarrow.h | 55 +++++++++++++++++++ 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 285af8de74aad..58d1f4659058a 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -35,6 +35,7 @@ cdef extern from "pandas/vendored/nanoarrow.h": void ArrowBitmapReserve(ArrowBitmap*, int64_t) void ArrowBitmapAppendInt8Unsafe(ArrowBitmap*, const int8_t *, int64_t) void ArrowBitmapReset(ArrowBitmap*) + void ArrowBitsUnpackInt8(const uint8_t*, int64_t, int64_t, int8_t*) int8_t ArrowBitGet(const uint8_t*, int64_t) void ArrowBitSetTo(uint8_t*, int64_t, uint8_t) @@ -223,13 +224,17 @@ def _unpickle_bitmaskarray(array, parent): return bma -cdef void buf_invert(uint8_t* dest, uint8_t* src, Py_ssize_t size): +@cython.boundscheck(False) 
+@cython.wraparound(False) +cdef void buf_invert(uint8_t* dest, uint8_t* src, Py_ssize_t size) noexcept: cdef Py_ssize_t i for i in range(size): dest[i] = ~src[i] -cdef void buf_or(uint8_t* dest, uint8_t* src1, uint8_t* src2, Py_ssize_t size): +@cython.boundscheck(False) +@cython.wraparound(False) +cdef void buf_or(uint8_t* dest, uint8_t* src1, uint8_t* src2, Py_ssize_t size) noexcept: cdef Py_ssize_t i for i in range(size): dest[i] = src1[i] | src2[i] @@ -354,9 +359,7 @@ cdef class BitMaskArray: @cython.wraparound(False) @staticmethod cdef void buffer_to_array_1d(uint8_t[:] out, const uint8_t* buf, Py_ssize_t size): - cdef Py_ssize_t i - for i in range(size): - out[i] = ArrowBitGet(buf, i) + ArrowBitsUnpackInt8(buf, 0, size, &out[0]) def to_numpy(self) -> ndarray: cdef ndarray[uint8_t] result = np.empty(self.bitmap.size_bits, dtype=bool) diff --git a/pandas/_libs/include/pandas/vendored/nanoarrow.h b/pandas/_libs/include/pandas/vendored/nanoarrow.h index 84440dcbd423f..1ff568416a05a 100644 --- a/pandas/_libs/include/pandas/vendored/nanoarrow.h +++ b/pandas/_libs/include/pandas/vendored/nanoarrow.h @@ -1482,6 +1482,11 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l /// \brief Count true values in a bitmap static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to); + +/// \brief Extract int8 boolean values from a range in a bitmap +static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, + int64_t length, int8_t* out); + /// \brief Initialize an ArrowBitmap /// /// Initialize the builder's buffer, empty its cache, and reset the size to zero @@ -2119,6 +2124,17 @@ static inline int64_t _ArrowBytesForBits(int64_t bits) { return (bits >> 3) + ((bits & 7) != 0); } +static inline void _ArrowBitsUnpackInt8(const uint8_t word, int8_t* out) { + out[0] = (word >> 0) & 1; + out[1] = (word >> 1) & 1; + out[2] = (word >> 2) & 1; + out[3] = (word >> 3) & 1; + out[4] = (word >> 4) & 1; + 
out[5] = (word >> 5) & 1; + out[6] = (word >> 6) & 1; + out[7] = (word >> 7) & 1; +} + static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { *out = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 | values[4] << 4 | values[5] << 5 | values[6] << 6 | values[7] << 7); @@ -2133,6 +2149,45 @@ static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) { return (bits[i >> 3] >> (i & 0x07)) & 1; } +static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, + int64_t length, int8_t* out) { + if (length == 0) { + return; + } + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + for (int i = 0; i < length; i++) { + out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + return; + } + + // first byte + for (int i = 0; i < 8 - (i_begin % 8); i++) { + *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + _ArrowBitsUnpackInt8(bits[i], out); + out += 8; + } + + // last byte + const int bits_remaining = i_end % 8 == 0 ? 
8 : i_end % 8; + for (int i = 0; i < bits_remaining; i++) { + *out++ = ArrowBitGet(&bits[bytes_last_valid], i); + } +} + static inline void ArrowBitSet(uint8_t* bits, int64_t i) { bits[i / 8] |= _ArrowkBitmask[i % 8]; } From c140af4a5f1445118461ddc4f4b1310f17067925 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 17 Aug 2023 12:53:42 -0400 Subject: [PATCH 034/126] sum impl --- pandas/_libs/arrays.pyx | 4 ++++ pandas/core/arrays/masked.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 58d1f4659058a..536597c514646 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -38,6 +38,7 @@ cdef extern from "pandas/vendored/nanoarrow.h": void ArrowBitsUnpackInt8(const uint8_t*, int64_t, int64_t, int8_t*) int8_t ArrowBitGet(const uint8_t*, int64_t) void ArrowBitSetTo(uint8_t*, int64_t, uint8_t) + int64_t ArrowBitCountSet(const uint8_t*, int64_t, int64_t) @cython.freelist(16) @@ -355,6 +356,9 @@ cdef class BitMaskArray: def nbytes(self) -> int: return self.bitmap.buffer.size_bytes + def sum(self) -> bool: + return ArrowBitCountSet(self.bitmap.buffer.data, 0, self.bitmap.size_bits) + @cython.boundscheck(False) @cython.wraparound(False) @staticmethod diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 14a64bd021465..5fc01ee1c6ea3 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1092,7 +1092,7 @@ def value_counts(self, dropna: bool = True) -> Series: # if we want nans, count the mask counts = np.empty(len(value_counts) + 1, dtype="int64") counts[:-1] = value_counts - counts[-1] = self._mask.to_numpy().sum() + counts[-1] = self._mask.sum() index = Index(keys, dtype=self.dtype).insert(len(keys), self.dtype.na_value) index = index.astype(self.dtype) From 86ce6569987bb2d526383de6d6074f676cf85a40 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 17 Aug 2023 13:01:08 -0400 Subject: [PATCH 035/126] any impl --- pandas/_libs/arrays.pyx 
| 6 ++++++ pandas/core/arrays/masked.py | 8 ++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 536597c514646..a9302719a02fa 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -356,6 +356,12 @@ cdef class BitMaskArray: def nbytes(self) -> int: return self.bitmap.buffer.size_bytes + def any(self) -> bool: + # TODO: we might want to create a short circuiting implementation in + # nanoarrow, but even with a complete sum this is cheaper than + # serializing to numpy for an any call + return ArrowBitCountSet(self.bitmap.buffer.data, 0, self.bitmap.size_bits) > 0 + def sum(self) -> bool: return ArrowBitCountSet(self.bitmap.buffer.data, 0, self.bitmap.size_bits) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 5fc01ee1c6ea3..e2474707949a4 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -676,7 +676,7 @@ def reconstruct(x: np.ndarray): return tuple(reconstruct(x) for x in result) elif method == "reduce": # e.g. np.add.reduce; test_ufunc_reduce_raises - if self._mask.to_numpy().any(): + if self._mask.any(): return self._na_value return result else: @@ -697,7 +697,7 @@ def _hasna(self) -> bool: # source code using it.. 
# error: Incompatible return value type (got "bool_", expected "bool") - return self._mask.to_numpy().any() # type: ignore[return-value] + return self._mask.any() # type: ignore[return-value] def _propagate_mask( self, mask: npt.NDArray[np.bool_] | BitMaskArray | None, other @@ -1401,7 +1401,7 @@ def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): if skipna: return result else: - if result or len(self) == 0 or not self._mask.to_numpy().any(): + if result or len(self) == 0 or not self._mask.any(): return result else: return self.dtype.na_value @@ -1485,7 +1485,7 @@ def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): if skipna: return result else: - if not result or len(self) == 0 or not self._mask.to_numpy().any(): + if not result or len(self) == 0 or not self._mask.any(): return result else: return self.dtype.na_value From 03b16611e23eaa5c38d98a5143a76a049eb87822 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 17 Aug 2023 13:46:43 -0400 Subject: [PATCH 036/126] updated cython typing --- pandas/_libs/arrays.pyi | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 81398fe0c893e..55347109a8ff2 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -47,9 +47,7 @@ class NDArrayBacked: class BitMaskArray: parent: Self - def __cinit__(self, data: np.ndarray | Self) -> None: ... def __init__(self, data: np.ndarray | Self) -> None: ... - def __dealloc__(self) -> None: ... def __setitem__(self, key: PositionalIndexer, value: ArrayLike | bool) -> None: ... def __getitem__(self, key: PositionalIndexer) -> bool: ... def __invert__(self) -> np.ndarray: ... 
From e9d4da4c47d8bb9a3e11ef37892098e10d43ee94 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 17 Aug 2023 16:24:36 -0400 Subject: [PATCH 037/126] remove bad __or__ impl --- pandas/_libs/arrays.pyx | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index a9302719a02fa..8dc38cdc44d3c 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -232,15 +232,6 @@ cdef void buf_invert(uint8_t* dest, uint8_t* src, Py_ssize_t size) noexcept: for i in range(size): dest[i] = ~src[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -cdef void buf_or(uint8_t* dest, uint8_t* src1, uint8_t* src2, Py_ssize_t size) noexcept: - cdef Py_ssize_t i - for i in range(size): - dest[i] = src1[i] | src2[i] - - cdef class BitMaskArray: cdef: ArrowBitmap bitmap @@ -323,30 +314,14 @@ cdef class BitMaskArray: result = np.empty(self.bitmap.size_bits, dtype=bool) cdef uint8_t* inverted = malloc(self.bitmap.size_bits) - buf_invert(inverted, self.bitmap.buffer.data, self.bitmap.size_bits) + # TODO: upstream invert or make sure we handle size == 0 here + buf_invert(inverted, self.bitmap.buffer.data, self.bitmap.size_bits // 8 + 1) BitMaskArray.buffer_to_array_1d(result, inverted, self.bitmap.size_bits) free(inverted) return result.reshape(self.array_shape) def __or__(self, other): - cdef ndarray[uint8_t] result - cdef uint8_t* ored - cdef BitMaskArray other_buf - if isinstance(other, type(self)): - other_buf = other - result = np.empty(self.bitmap.size_bits, dtype=bool) - ored = malloc(self.bitmap.size_bits) - buf_or( - ored, - self.bitmap.buffer.data, - other_buf.bitmap.buffer.data, - self.bitmap.size_bits - ) - BitMaskArray.buffer_to_array_1d(result, ored, self.bitmap.size_bits) - free(ored) - return result.reshape(self.array_shape) - else: - return self.to_numpy() | other + return self.to_numpy() | other def __reduce__(self): object_state = (self.to_numpy(), self.parent) From 
1993e969137bcbcd193cb25ac4d6b272fc781a6e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 17 Aug 2023 16:51:41 -0400 Subject: [PATCH 038/126] fix __or__ --- pandas/_libs/arrays.pyx | 3 +++ pandas/core/arrays/masked.py | 4 +--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 8dc38cdc44d3c..5a1f3cc3109f4 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -321,6 +321,9 @@ cdef class BitMaskArray: return result.reshape(self.array_shape) def __or__(self, other): + if isinstance(other, type(self)): + return self.to_numpy() | other.to_numpy() + return self.to_numpy() | other def __reduce__(self): diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index e2474707949a4..7d82760c7faa3 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -695,9 +695,7 @@ def _hasna(self) -> bool: # Note: this is expensive right now! The hope is that we can # make this faster by having an optional mask, but not have to change # source code using it.. 
- - # error: Incompatible return value type (got "bool_", expected "bool") - return self._mask.any() # type: ignore[return-value] + return self._mask.any() def _propagate_mask( self, mask: npt.NDArray[np.bool_] | BitMaskArray | None, other From 10ce5ca01fbcbcf6e000c90dd71073538afa592b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 17 Aug 2023 17:29:31 -0400 Subject: [PATCH 039/126] removed faulty inversion --- pandas/_libs/arrays.pyx | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 5a1f3cc3109f4..09ce24cceb10e 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -16,12 +16,6 @@ from numpy cimport ( cnp.import_array() -from libc.stdlib cimport ( - free, - malloc, -) - - cdef extern from "pandas/vendored/nanoarrow.h": struct ArrowBuffer: uint8_t* data @@ -225,13 +219,6 @@ def _unpickle_bitmaskarray(array, parent): return bma -@cython.boundscheck(False) -@cython.wraparound(False) -cdef void buf_invert(uint8_t* dest, uint8_t* src, Py_ssize_t size) noexcept: - cdef Py_ssize_t i - for i in range(size): - dest[i] = ~src[i] - cdef class BitMaskArray: cdef: ArrowBitmap bitmap @@ -310,15 +297,7 @@ cdef class BitMaskArray: return self.to_numpy()[key] def __invert__(self): - cdef ndarray[uint8_t] result - result = np.empty(self.bitmap.size_bits, dtype=bool) - - cdef uint8_t* inverted = malloc(self.bitmap.size_bits) - # TODO: upstream invert or make sure we handle size == 0 here - buf_invert(inverted, self.bitmap.buffer.data, self.bitmap.size_bits // 8 + 1) - BitMaskArray.buffer_to_array_1d(result, inverted, self.bitmap.size_bits) - free(inverted) - return result.reshape(self.array_shape) + return ~self.to_numpy() def __or__(self, other): if isinstance(other, type(self)): From e8b7819d41582b5228361548afbaafcfe2dd7165 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 17 Aug 2023 18:27:14 -0400 Subject: [PATCH 040/126] more performant bit unpacking --- 
pandas/_libs/include/pandas/vendored/nanoarrow.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/include/pandas/vendored/nanoarrow.h b/pandas/_libs/include/pandas/vendored/nanoarrow.h index 1ff568416a05a..aad56a29eb5ab 100644 --- a/pandas/_libs/include/pandas/vendored/nanoarrow.h +++ b/pandas/_libs/include/pandas/vendored/nanoarrow.h @@ -2125,14 +2125,14 @@ static inline int64_t _ArrowBytesForBits(int64_t bits) { } static inline void _ArrowBitsUnpackInt8(const uint8_t word, int8_t* out) { - out[0] = (word >> 0) & 1; - out[1] = (word >> 1) & 1; - out[2] = (word >> 2) & 1; - out[3] = (word >> 3) & 1; - out[4] = (word >> 4) & 1; - out[5] = (word >> 5) & 1; - out[6] = (word >> 6) & 1; - out[7] = (word >> 7) & 1; + out[0] = (word & 0x1) != 0; + out[1] = (word & 0x2) != 0; + out[2] = (word & 0x4) != 0; + out[3] = (word & 0x8) != 0; + out[4] = (word & 0x10) != 0; + out[5] = (word & 0x20) != 0; + out[6] = (word & 0x40) != 0; + out[7] = (word & 0x80) != 0; } static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { From 9fdb65294df59ee9094db5879cbd1ff174b35975 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 18 Aug 2023 10:19:03 -0400 Subject: [PATCH 041/126] try non-shift nanoarrow packing --- pandas/_libs/include/pandas/vendored/nanoarrow.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/include/pandas/vendored/nanoarrow.h b/pandas/_libs/include/pandas/vendored/nanoarrow.h index aad56a29eb5ab..30fcf04008eba 100644 --- a/pandas/_libs/include/pandas/vendored/nanoarrow.h +++ b/pandas/_libs/include/pandas/vendored/nanoarrow.h @@ -2136,8 +2136,15 @@ static inline void _ArrowBitsUnpackInt8(const uint8_t word, int8_t* out) { } static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { - *out = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 | values[4] << 4 | - values[5] << 5 | values[6] << 6 | values[7] << 7); + *out = (values[0] + | 
((values[1] + 0x1) & 0x2) + | ((values[2] + 0x3) & 0x4) + | ((values[3] + 0x7) & 0x8) + | ((values[4] + 0xf) & 0x10) + | ((values[5] + 0x1f) & 0x20) + | ((values[6] + 0x3f) & 0x40) + | ((values[7] + 0x7f) & 0x80) + ); } static inline void _ArrowBitmapPackInt32(const int32_t* values, uint8_t* out) { From 17059cbb287e15aea9e09a887da8f77f04921bca Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 18 Aug 2023 10:44:26 -0400 Subject: [PATCH 042/126] Remove to_numpy + copy chains --- pandas/core/arrays/masked.py | 16 ++++++++-------- pandas/tests/arrays/masked/test_arithmetic.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 7d82760c7faa3..dc93e58030dbe 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -416,22 +416,22 @@ def round(self, decimals: int = 0, *args, **kwargs): values = np.round(self._data, decimals=decimals, **kwargs) # Usually we'll get same type as self, but ndarray[bool] casts to float - return self._maybe_mask_result(values, self._mask.to_numpy().copy()) + return self._maybe_mask_result(values, self._mask.to_numpy()) # ------------------------------------------------------------------ # Unary Methods def __invert__(self) -> Self: - return self._simple_new(~self._data, self._mask.to_numpy().copy()) + return self._simple_new(~self._data, self._mask.to_numpy()) def __neg__(self) -> Self: - return self._simple_new(-self._data, self._mask.to_numpy().copy()) + return self._simple_new(-self._data, self._mask.to_numpy()) def __pos__(self) -> Self: return self.copy() def __abs__(self) -> Self: - return self._simple_new(abs(self._data), self._mask.to_numpy().copy()) + return self._simple_new(abs(self._data), self._mask.to_numpy()) # ------------------------------------------------------------------ @@ -565,7 +565,7 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: data = self._data.astype(dtype.numpy_dtype, copy=copy) # mask is 
copied depending on whether the data was copied, and # not directly depending on the `copy` keyword - mask = self._mask if data is self._data else self._mask.to_numpy().copy() + mask = self._mask if data is self._data else self._mask.to_numpy() cls = dtype.construct_array_type() return cls(data, mask, copy=False) # type: ignore[arg-type] @@ -702,7 +702,7 @@ def _propagate_mask( ) -> npt.NDArray[np.bool_]: if mask is None: mask = ( - self._mask.to_numpy().copy() + self._mask.to_numpy() ) # TODO: need test for BooleanArray needing a copy if other is libmissing.NA: # GH#45421 don't alter inplace @@ -900,7 +900,7 @@ def _maybe_mask_result( return result def isna(self) -> np.ndarray: - return self._mask.to_numpy().copy() + return self._mask.to_numpy() @property def _na_value(self): @@ -982,7 +982,7 @@ def isin(self, values) -> BooleanArray: # type: ignore[override] def copy(self) -> Self: data = self._data.copy() - mask = self._mask.to_numpy().copy() + mask = self._mask.to_numpy() return self._simple_new(data, mask) def unique(self) -> Self: diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py index 21e292e5bbc29..04deac24a9211 100644 --- a/pandas/tests/arrays/masked/test_arithmetic.py +++ b/pandas/tests/arrays/masked/test_arithmetic.py @@ -76,7 +76,7 @@ def test_array_NA(data, all_arithmetic_operators): scalar = pd.NA scalar_array = pd.array([pd.NA] * len(data), dtype=data.dtype) - mask = data._mask.to_numpy().copy() + mask = data._mask.to_numpy() if is_bool_not_implemented(data, all_arithmetic_operators): msg = "operator '.*' not implemented for bool dtypes" From c5a3584ce1124516def7cae211631dc2c3215430 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 18 Aug 2023 11:34:33 -0400 Subject: [PATCH 043/126] higher performance dunders --- pandas/_libs/arrays.pyx | 139 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 133 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx 
index 09ce24cceb10e..ec9f74e723738 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -7,6 +7,10 @@ import numpy as np cimport numpy as cnp from cpython cimport PyErr_Clear +from libc.stdlib cimport ( + free, + malloc, +) from numpy cimport ( int8_t, int64_t, @@ -297,14 +301,87 @@ cdef class BitMaskArray: return self.to_numpy()[key] def __invert__(self): + # TODO: could invert the buffer first then go to numpy return ~self.to_numpy() + def __and__(self, other): + cdef ndarray[uint8_t] result + cdef BitMaskArray other_bma + if isinstance(other, type(self)): + other_bma = other + if self.bitmap.size_bits == 0: + return np.empty(dtype=bool).reshape(self.array_shape) + + buf = malloc(self.bitmap.size_bits) + BitMaskArray.buf_and( + self.bitmap.buffer.data, + other_bma.bitmap.buffer.data, + self.bitmap.size_bits // 8 + 1, + buf + ) + result = np.empty(self.bitmap.size_bits, dtype=bool) + BitMaskArray.buffer_to_array_1d( + result, + buf, + self.bitmap.size_bits + ) + free(buf) + return result.reshape(self.array_shape) + + return self.to_numpy() & other + def __or__(self, other): + cdef ndarray[uint8_t] result + cdef BitMaskArray other_bma if isinstance(other, type(self)): - return self.to_numpy() | other.to_numpy() + other_bma = other + if self.bitmap.size_bits == 0: + return np.empty(dtype=bool).reshape(self.array_shape) + + buf = malloc(self.bitmap.size_bits) + BitMaskArray.buf_or( + self.bitmap.buffer.data, + other_bma.bitmap.buffer.data, + self.bitmap.size_bits // 8 + 1, + buf + ) + result = np.empty(self.bitmap.size_bits, dtype=bool) + BitMaskArray.buffer_to_array_1d( + result, + buf, + self.bitmap.size_bits + ) + free(buf) + return result.reshape(self.array_shape) return self.to_numpy() | other + def __xor__(self, other): + cdef ndarray[uint8_t] result + cdef BitMaskArray other_bma + if isinstance(other, type(self)): + other_bma = other + if self.bitmap.size_bits == 0: + return np.empty(dtype=bool).reshape(self.array_shape) + + buf = 
malloc(self.bitmap.size_bits) + BitMaskArray.buf_xor( + self.bitmap.buffer.data, + other_bma.bitmap.buffer.data, + self.bitmap.size_bits // 8 + 1, + buf + ) + result = np.empty(self.bitmap.size_bits, dtype=bool) + BitMaskArray.buffer_to_array_1d( + result, + buf, + self.bitmap.size_bits + ) + free(buf) + return result.reshape(self.array_shape) + + return self.to_numpy() ^ other + def __reduce__(self): object_state = (self.to_numpy(), self.parent) return (_unpickle_bitmaskarray, object_state, self.parent) @@ -314,20 +391,70 @@ cdef class BitMaskArray: return self.bitmap.buffer.size_bytes def any(self) -> bool: - # TODO: we might want to create a short circuiting implementation in - # nanoarrow, but even with a complete sum this is cheaper than - # serializing to numpy for an any call - return ArrowBitCountSet(self.bitmap.buffer.data, 0, self.bitmap.size_bits) > 0 + return BitMaskArray.buf_any(self.bitmap.buffer.data, self.bitmap.size_bits) def sum(self) -> bool: return ArrowBitCountSet(self.bitmap.buffer.data, 0, self.bitmap.size_bits) - @cython.boundscheck(False) + @cython.boundscheck(False) # TODO: Removing this causes an IndexError? Zero size? 
@cython.wraparound(False) @staticmethod cdef void buffer_to_array_1d(uint8_t[:] out, const uint8_t* buf, Py_ssize_t size): ArrowBitsUnpackInt8(buf, 0, size, &out[0]) + @staticmethod + cdef bint buf_any(const uint8_t* buf1, Py_ssize_t nbits): + cdef Py_ssize_t i, nbytes = nbits // 8 + 1, rem = nbits % 8 + if nbits == 0: + return False + + for i in range(nbytes): + if buf1[i] > 0: + return True + + for i in range(rem): + if ArrowBitGet(buf1, nbits - rem): + return True + + return False + + # TODO: clean up signatures - don't mix nbits and nbytes + # Note that in cases where the size_bits doesn't end on a word + # boundary that these will still operate on the remaining bits, + # with undefined values therein + @staticmethod + cdef void buf_or( + const uint8_t* buf1, + const uint8_t* buf2, + Py_ssize_t nbytes, + uint8_t* out + ): + cdef Py_ssize_t i + for i in range(nbytes): + out[i] = buf1[i] | buf2[i] + + @staticmethod + cdef void buf_xor( + const uint8_t* buf1, + const uint8_t* buf2, + Py_ssize_t nbytes, + uint8_t* out + ): + cdef Py_ssize_t i + for i in range(nbytes): + out[i] = buf1[i] ^ buf2[i] + + @staticmethod + cdef void buf_and( + const uint8_t* buf1, + const uint8_t* buf2, + Py_ssize_t nbytes, + uint8_t* out + ): + cdef Py_ssize_t i + for i in range(nbytes): + out[i] = buf1[i] & buf2[i] + def to_numpy(self) -> ndarray: cdef ndarray[uint8_t] result = np.empty(self.bitmap.size_bits, dtype=bool) BitMaskArray.buffer_to_array_1d( From 28b589fd90d971063392ec6222d3a67bb91c0918 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 18 Aug 2023 11:43:39 -0400 Subject: [PATCH 044/126] updated typing --- pandas/_libs/arrays.pyi | 4 ++++ pandas/_libs/arrays.pyx | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 55347109a8ff2..528ee56deeecd 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -51,8 +51,12 @@ class BitMaskArray: def __setitem__(self, key: PositionalIndexer, value: 
ArrayLike | bool) -> None: ... def __getitem__(self, key: PositionalIndexer) -> bool: ... def __invert__(self) -> np.ndarray: ... + def __and__(self, other: np.ndarray | Self) -> np.ndarray: ... def __or__(self, other: np.ndarray | Self) -> np.ndarray: ... + def __xor__(self, other: np.ndarray | Self) -> np.ndarray: ... def __reduce__(self) -> Tuple[Callable[[np.ndarray], Self], Tuple[np.ndarray]]: ... @property def nbytes(self) -> int: ... + def any(self) -> bool: ... + def sum(self) -> int: ... def to_numpy(self) -> np.ndarray: ... diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index ec9f74e723738..e5ac6d1c92572 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -393,7 +393,7 @@ cdef class BitMaskArray: def any(self) -> bool: return BitMaskArray.buf_any(self.bitmap.buffer.data, self.bitmap.size_bits) - def sum(self) -> bool: + def sum(self) -> int: return ArrowBitCountSet(self.bitmap.buffer.data, 0, self.bitmap.size_bits) @cython.boundscheck(False) # TODO: Removing this causes an IndexError? Zero size? 
From e3618fbfc190c4ec5d2224e83a7c651163d491a9 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 18 Aug 2023 12:10:38 -0400 Subject: [PATCH 045/126] consolidated to_numpy() --- pandas/core/arrays/masked.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index dc93e58030dbe..2d22029c5fd5b 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -197,7 +197,8 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: # TODO: need to change this to special case multiple # indexers versus just scalar - newmask = self._mask.to_numpy()[item] + np_mask = self._mask.to_numpy() + newmask = np_mask[item] if is_bool(newmask): # This is a scalar indexing if newmask: @@ -205,7 +206,7 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self._data[item] # sending self._mask avoids copy of buffer - if np.array_equal(newmask, self._mask.to_numpy()): + if np.array_equal(newmask, np_mask): return self._simple_new(self._data[item], self._mask) return self._simple_new(self._data[item], newmask) From 4c82771407e663993aba947e859a575cf3830c99 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 18 Aug 2023 13:48:25 -0400 Subject: [PATCH 046/126] fixups --- pandas/_libs/arrays.pyx | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index e5ac6d1c92572..4c438bec43a7b 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -402,18 +402,20 @@ cdef class BitMaskArray: cdef void buffer_to_array_1d(uint8_t[:] out, const uint8_t* buf, Py_ssize_t size): ArrowBitsUnpackInt8(buf, 0, size, &out[0]) + @cython.boundscheck(False) + @cython.wraparound(False) @staticmethod - cdef bint buf_any(const uint8_t* buf1, Py_ssize_t nbits): - cdef Py_ssize_t i, nbytes = nbits // 8 + 1, rem = nbits % 8 + cdef bint buf_any(const uint8_t* buf, Py_ssize_t nbits): + cdef Py_ssize_t i, 
bits_remaining = nbits % 8, size_bytes = nbits // 8 if nbits == 0: return False - for i in range(nbytes): - if buf1[i] > 0: + for i in range(size_bytes): + if buf[i] > 0: return True - for i in range(rem): - if ArrowBitGet(buf1, nbits - rem): + for i in range(bits_remaining): + if ArrowBitGet(buf, nbits - i - 1): return True return False @@ -422,6 +424,8 @@ cdef class BitMaskArray: # Note that in cases where the size_bits doesn't end on a word # boundary that these will still operate on the remaining bits, # with undefined values therein + @cython.boundscheck(False) + @cython.wraparound(False) @staticmethod cdef void buf_or( const uint8_t* buf1, @@ -433,6 +437,8 @@ cdef class BitMaskArray: for i in range(nbytes): out[i] = buf1[i] | buf2[i] + @cython.boundscheck(False) + @cython.wraparound(False) @staticmethod cdef void buf_xor( const uint8_t* buf1, @@ -444,6 +450,8 @@ cdef class BitMaskArray: for i in range(nbytes): out[i] = buf1[i] ^ buf2[i] + @cython.boundscheck(False) + @cython.wraparound(False) @staticmethod cdef void buf_and( const uint8_t* buf1, From 633935d1c368be75f90a5c2a708bae769ad413ad Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Aug 2023 11:25:51 -0400 Subject: [PATCH 047/126] deferred to_numpy() calls in boolean --- pandas/_libs/arrays.pyi | 3 +++ pandas/_libs/arrays.pyx | 10 ++++++++ pandas/core/arrays/boolean.py | 12 +++------ pandas/core/ops/mask_ops.py | 47 ++++++++++++++++++++++++----------- 4 files changed, 49 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 528ee56deeecd..285d96df0f2bb 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -13,6 +13,7 @@ from pandas._typing import ( PositionalIndexer, Self, Shape, + type_t, ) class NDArrayBacked: @@ -57,6 +58,8 @@ class BitMaskArray: def __reduce__(self) -> Tuple[Callable[[np.ndarray], Self], Tuple[np.ndarray]]: ... @property def nbytes(self) -> int: ... + def shape(self) -> tuple[int, ...]: ... 
+ def dtype(self) -> type_t[bool]: ... def any(self) -> bool: ... def sum(self) -> int: ... def to_numpy(self) -> np.ndarray: ... diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 4c438bec43a7b..47e3fc38a698c 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -390,6 +390,16 @@ cdef class BitMaskArray: def nbytes(self) -> int: return self.bitmap.buffer.size_bytes + @property + def shape(self): + """Strictly for NumPy compat in mask_ops""" + return self.array_shape + + @property + def dtype(self): + """Strictly for NumPy compat in mask_ops""" + return bool + def any(self) -> bool: return BitMaskArray.buf_any(self.bitmap.buffer.data, self.bitmap.size_bits) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 62ae43f529204..06da136a63d86 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -363,7 +363,7 @@ def _logical_method(self, other, op): mask = None if isinstance(other, BooleanArray): - other, mask = other._data, other._mask.to_numpy() + other, mask = other._data, other._mask elif is_list_like(other): other = np.asarray(other, dtype="bool") if other.ndim > 1: @@ -382,16 +382,12 @@ def _logical_method(self, other, op): raise ValueError("Lengths must match") if op.__name__ in {"or_", "ror_"}: - result, mask = ops.kleene_or(self._data, other, self._mask.to_numpy(), mask) + result, mask = ops.kleene_or(self._data, other, self._mask, mask) elif op.__name__ in {"and_", "rand_"}: - result, mask = ops.kleene_and( - self._data, other, self._mask.to_numpy(), mask - ) + result, mask = ops.kleene_and(self._data, other, self._mask, mask) else: # i.e. xor, rxor - result, mask = ops.kleene_xor( - self._data, other, self._mask.to_numpy(), mask - ) + result, mask = ops.kleene_xor(self._data, other, self._mask, mask) # i.e. 
BooleanArray return self._maybe_mask_result(result, mask) diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index adc1f63c568bf..049a815296a3f 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -9,13 +9,14 @@ lib, missing as libmissing, ) +from pandas._libs.arrays import BitMaskArray def kleene_or( left: bool | np.ndarray | libmissing.NAType, right: bool | np.ndarray | libmissing.NAType, - left_mask: np.ndarray | None, - right_mask: np.ndarray | None, + left_mask: np.ndarray | BitMaskArray | None, + right_mask: np.ndarray | BitMaskArray | None, ): """ Boolean ``or`` using Kleene logic. @@ -53,6 +54,8 @@ def kleene_or( result = left | right if right_mask is not None: + left_mask = left_mask.to_numpy() + right_mask = right_mask.to_numpy() # output is unknown where (False & NA), (NA & False), (NA & NA) left_false = ~(left | left_mask) right_false = ~(right | right_mask) @@ -63,12 +66,13 @@ def kleene_or( ) else: if right is True: - mask = np.zeros_like(left_mask) - elif right is libmissing.NA: - mask = (~left & ~left_mask) | left_mask + mask = np.zeros(left_mask.shape, left_mask.dtype) else: - # False - mask = left_mask.copy() + left_mask = left_mask.to_numpy() + if right is libmissing.NA: + mask = (~left & ~left_mask) | left_mask + else: + mask = left_mask return result, mask @@ -76,8 +80,8 @@ def kleene_or( def kleene_xor( left: bool | np.ndarray | libmissing.NAType, right: bool | np.ndarray | libmissing.NAType, - left_mask: np.ndarray | None, - right_mask: np.ndarray | None, + left_mask: np.ndarray | BitMaskArray | None, + right_mask: np.ndarray | BitMaskArray | None, ): """ Boolean ``xor`` using Kleene logic. 
@@ -117,9 +121,12 @@ def kleene_xor( if right_mask is None: if right is libmissing.NA: - mask = np.ones_like(left_mask) + mask = np.ones(left_mask.shape, left_mask.dtype) else: - mask = left_mask.copy() + if isinstance(left_mask, BitMaskArray): + mask = left_mask.to_numpy() + else: + mask = left_mask.copy() else: mask = left_mask | right_mask @@ -129,8 +136,8 @@ def kleene_xor( def kleene_and( left: bool | libmissing.NAType | np.ndarray, right: bool | libmissing.NAType | np.ndarray, - left_mask: np.ndarray | None, - right_mask: np.ndarray | None, + left_mask: np.ndarray | BitMaskArray | None, + right_mask: np.ndarray | BitMaskArray | None, ): """ Boolean ``and`` using Kleene logic. @@ -166,16 +173,26 @@ def kleene_and( result = left & right if right_mask is None: + if isinstance(left_mask, BitMaskArray): + left_mask = left_mask.to_numpy() + # Scalar `right` if right is libmissing.NA: mask = (left & ~left_mask) | left_mask - else: - mask = left_mask.copy() + if not isinstance(left_mask, BitMaskArray): # already a copy + mask = left_mask.copy() if right is False: # unmask everything mask[:] = False else: + # TODO: Cython 3 changed support for radd / ror methods and may + # not be working? 
For now convert to NumPy + if isinstance(left_mask, BitMaskArray): + left_mask = left_mask.to_numpy() + if isinstance(right_mask, BitMaskArray): + right_mask = right_mask.to_numpy() + # unmask where either left or right is False left_false = ~(left | left_mask) right_false = ~(right | right_mask) From 6ed2c55c23328d1dd509cda8ffc6ee5573706358 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Aug 2023 12:58:35 -0400 Subject: [PATCH 048/126] test fix --- pandas/core/ops/mask_ops.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index 049a815296a3f..a6dbb1db28af2 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -54,8 +54,10 @@ def kleene_or( result = left | right if right_mask is not None: - left_mask = left_mask.to_numpy() - right_mask = right_mask.to_numpy() + if isinstance(left_mask, BitMaskArray): + left_mask = left_mask.to_numpy() + if isinstance(right_mask, BitMaskArray): + right_mask = right_mask.to_numpy() # output is unknown where (False & NA), (NA & False), (NA & NA) left_false = ~(left | left_mask) right_false = ~(right | right_mask) From d8e715d699d78ee63b11403934e86dc9a82cab5e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Aug 2023 15:07:26 -0400 Subject: [PATCH 049/126] take and copy implementations --- pandas/_libs/arrays.pyi | 2 + pandas/_libs/arrays.pyx | 81 +++++++++++++++++++++++++++++++++++- pandas/core/algorithms.py | 15 +++++-- pandas/core/arrays/masked.py | 14 +++---- 4 files changed, 98 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 285d96df0f2bb..9c53fa93f473d 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -62,4 +62,6 @@ class BitMaskArray: def dtype(self) -> type_t[bool]: ... def any(self) -> bool: ... def sum(self) -> int: ... + def take(self, indices: np.ndarray, axis: int, fill_value: bool) -> np.ndarray: ... + def copy(self) -> Self: ... 
def to_numpy(self) -> np.ndarray: ... diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 47e3fc38a698c..78ad8bbdc4ec7 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -11,6 +11,7 @@ from libc.stdlib cimport ( free, malloc, ) +from libc.string cimport memcpy from numpy cimport ( int8_t, int64_t, @@ -248,7 +249,10 @@ cdef class BitMaskArray: self.buffer_owner = False self.bitmap = bma.bitmap - def __cinit__(self, data, parent=None): + def __cinit__(self): + self.parent = False + + def __init__(self, data, parent=None): # parent is only required to reconstruct ref-counting from pickle # but should not be called from user code if isinstance(data, np.ndarray): @@ -272,6 +276,30 @@ cdef class BitMaskArray: if self.buffer_owner: ArrowBitmapReset(&self.bitmap) + @staticmethod + cdef BitMaskArray copy_from_bitmaskarray(BitMaskArray old_bma): + """ + Constructs a new BitMaskArray from a bitmap pointer. Copies data + and manages the subsequenty lifecycle of the bitmap. 
+ """ + # Bypass __init__ calls + cdef BitMaskArray bma = BitMaskArray.__new__(BitMaskArray) + cdef uint8_t* buf + cdef ArrowBitmap bitmap + # TODO: this leaks a bit into the internals of the nanoarrow bitmap + # We may want to upstream a BitmapCopy function instead + ArrowBitmapInit(&bitmap) + buf = malloc(old_bma.bitmap.size_bits) + memcpy(buf, old_bma.bitmap.buffer.data, old_bma.bitmap.size_bits) + bitmap.buffer.size_bytes = old_bma.bitmap.buffer.size_bytes + bitmap.size_bits = old_bma.bitmap.size_bits + bitmap.buffer.data = buf + + bma.bitmap = bitmap + bma.array_shape = old_bma.array_shape + bma.buffer_owner = True + return bma + def __setitem__(self, key, value): cdef const uint8_t[:] arr1d cdef Py_ssize_t i = 0 @@ -406,6 +434,57 @@ cdef class BitMaskArray: def sum(self) -> int: return ArrowBitCountSet(self.bitmap.buffer.data, 0, self.bitmap.size_bits) + @cython.wraparound(False) + @cython.boundscheck(False) + cdef void c_take( + self, + const int64_t[:] indices, + uint8_t[:] out, + bint fill_value, + bint allow_fill + ): + # TODO: we should try and upstream this into nanoarrow with a better algo + cdef Py_ssize_t i + cdef uint8_t value + if not allow_fill: + for i in range(indices.shape[0]): + out[i] = ArrowBitGet(self.bitmap.buffer.data, indices[i]) + else: + for i in range(indices.shape[0]): + value = ArrowBitGet(self.bitmap.buffer.data, indices[i]) + if value == 1: + out[i] = fill_value + else: + out[i] = value + + def take( + self, + const int64_t[:] indices, + int axis=0, + bint fill_value=0, + bint allow_fill=0 + ) -> np.ndarray: + if axis != 0: + raise NotImplementedError( + "BitMaskArray.take only implemented for axis=0" + ) + + # TODO: would be great to check this here, though most of these functions + # are by definition unsafe + # if indices.min() < 0: + # raise NotImplementedError( + # "BitMaskArray.take does not support negative index values" + # ) + + # TODO: indices.shape gave wrong number of dimensions, expected 1 got 8 + # len(indices) works 
the same as long as 1d assumption holds + result = np.empty(len(indices), dtype=bool) + self.c_take(indices, result, fill_value, allow_fill) + return result + + def copy(self): + return BitMaskArray.copy_from_bitmaskarray(self) + @cython.boundscheck(False) # TODO: Removing this causes an IndexError? Zero size? @cython.wraparound(False) @staticmethod diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3861a18316563..b7c6cb2130c58 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -21,6 +21,7 @@ iNaT, lib, ) +from pandas._libs.arrays import BitMaskArray from pandas._typing import ( AnyArrayLike, ArrayLike, @@ -1286,21 +1287,27 @@ def take( ... fill_value=-10) array([ 10, 10, -10]) """ - if not isinstance(arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries)): + if not isinstance( + arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries, BitMaskArray) + ): # GH#52981 warnings.warn( "pd.api.extensions.take accepting non-standard inputs is deprecated " "and will raise in a future version. 
Pass either a numpy.ndarray, " - "ExtensionArray, Index, or Series instead.", + "ExtensionArray, Index, Series, or BitMaskArray instead.", FutureWarning, stacklevel=find_stack_level(), ) - if not is_array_like(arr): + if not isinstance(arr, BitMaskArray) and not is_array_like(arr): arr = np.asarray(arr) indices = ensure_platform_int(indices) + # BitMaskArray does not support negative indexing + if isinstance(arr, BitMaskArray) and indices.size > 0 and indices.min() < 0: + arr = arr.to_numpy() + if allow_fill: # Pandas style, -1 means NA validate_indices(indices, arr.shape[axis]) @@ -1308,7 +1315,7 @@ def take( arr, indices, axis=axis, allow_fill=True, fill_value=fill_value ) else: - # NumPy style + # NumPy / BitMaskArray style result = arr.take(indices, axis=axis) return result diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 2d22029c5fd5b..4b8e8e2681636 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -147,13 +147,9 @@ def __init__( if values.shape != mask.shape: raise ValueError("values.shape must match mask.shape") - if copy: - values = values.copy() - mask = mask.copy() - else: - if copy: - values = values.copy() - mask = mask.to_numpy() + if copy: + values = values.copy() + mask = mask.copy() self._data = values self._mask = BitMaskArray(mask) @@ -941,7 +937,7 @@ def take( ) mask = take( - self._mask.to_numpy(), + self._mask, indexer, fill_value=True, allow_fill=allow_fill, @@ -983,7 +979,7 @@ def isin(self, values) -> BooleanArray: # type: ignore[override] def copy(self) -> Self: data = self._data.copy() - mask = self._mask.to_numpy() + mask = self._mask.copy() return self._simple_new(data, mask) def unique(self) -> Self: From 37ccec332e47340a6bdb20d929f10a8cbc58c473 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Aug 2023 15:31:39 -0400 Subject: [PATCH 050/126] small optimization --- pandas/_libs/arrays.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 78ad8bbdc4ec7..edee22f57a01d 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -495,14 +495,16 @@ cdef class BitMaskArray: @cython.wraparound(False) @staticmethod cdef bint buf_any(const uint8_t* buf, Py_ssize_t nbits): - cdef Py_ssize_t i, bits_remaining = nbits % 8, size_bytes = nbits // 8 - if nbits == 0: + cdef Py_ssize_t i, bits_remaining, size_bytes + if nbits < 1: return False for i in range(size_bytes): if buf[i] > 0: return True + bits_remaining = nbits % 8 + size_bytes = nbits // 8 for i in range(bits_remaining): if ArrowBitGet(buf, nbits - i - 1): return True From 5436b04644a237abb853146577e0fbdeaf5161b4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Aug 2023 18:09:31 -0400 Subject: [PATCH 051/126] simplified buf passing and fixed bugs --- pandas/_libs/arrays.pyx | 80 +++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index edee22f57a01d..fdac88f165ab4 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -340,13 +340,11 @@ cdef class BitMaskArray: if self.bitmap.size_bits == 0: return np.empty(dtype=bool).reshape(self.array_shape) + if self.bitmap.size_bits != other_bma.bitmap.size_bits: + raise ValueError("bitmaps are not equal size") + buf = malloc(self.bitmap.size_bits) - BitMaskArray.buf_and( - self.bitmap.buffer.data, - other_bma.bitmap.buffer.data, - self.bitmap.size_bits // 8 + 1, - buf - ) + BitMaskArray.buf_and(&self.bitmap, &other_bma.bitmap, buf) result = np.empty(self.bitmap.size_bits, dtype=bool) BitMaskArray.buffer_to_array_1d( result, @@ -366,13 +364,11 @@ cdef class BitMaskArray: if self.bitmap.size_bits == 0: return np.empty(dtype=bool).reshape(self.array_shape) + if self.bitmap.size_bits != other_bma.bitmap.size_bits: + raise ValueError("bitmaps are not equal size") + buf = malloc(self.bitmap.size_bits) - BitMaskArray.buf_or( - 
self.bitmap.buffer.data, - other_bma.bitmap.buffer.data, - self.bitmap.size_bits // 8 + 1, - buf - ) + BitMaskArray.buf_or(&self.bitmap, &other_bma.bitmap, buf) result = np.empty(self.bitmap.size_bits, dtype=bool) BitMaskArray.buffer_to_array_1d( result, @@ -392,13 +388,11 @@ cdef class BitMaskArray: if self.bitmap.size_bits == 0: return np.empty(dtype=bool).reshape(self.array_shape) + if self.bitmap.size_bits != other_bma.bitmap.size_bits: + raise ValueError("bitmaps are not equal size") + buf = malloc(self.bitmap.size_bits) - BitMaskArray.buf_xor( - self.bitmap.buffer.data, - other_bma.bitmap.buffer.data, - self.bitmap.size_bits // 8 + 1, - buf - ) + BitMaskArray.buf_xor(&self.bitmap, &other_bma.bitmap, buf) result = np.empty(self.bitmap.size_bits, dtype=bool) BitMaskArray.buffer_to_array_1d( result, @@ -429,7 +423,7 @@ cdef class BitMaskArray: return bool def any(self) -> bool: - return BitMaskArray.buf_any(self.bitmap.buffer.data, self.bitmap.size_bits) + return BitMaskArray.buf_any(&self.bitmap) def sum(self) -> int: return ArrowBitCountSet(self.bitmap.buffer.data, 0, self.bitmap.size_bits) @@ -494,19 +488,20 @@ cdef class BitMaskArray: @cython.boundscheck(False) @cython.wraparound(False) @staticmethod - cdef bint buf_any(const uint8_t* buf, Py_ssize_t nbits): - cdef Py_ssize_t i, bits_remaining, size_bytes - if nbits < 1: + cdef bint buf_any(const ArrowBitmap* bitmap): + cdef Py_ssize_t i, bits_remaining + cdef int64_t size_bits = bitmap.size_bits + cdef const uint8_t* buf = bitmap.buffer.data + if size_bits < 1: return False - for i in range(size_bytes): + for i in range(bitmap.buffer.size_bytes): if buf[i] > 0: return True - bits_remaining = nbits % 8 - size_bytes = nbits // 8 + bits_remaining = size_bits % 8 for i in range(bits_remaining): - if ArrowBitGet(buf, nbits - i - 1): + if ArrowBitGet(buf, size_bits - i - 1): return True return False @@ -519,12 +514,17 @@ cdef class BitMaskArray: @cython.wraparound(False) @staticmethod cdef void buf_or( - const 
uint8_t* buf1, - const uint8_t* buf2, - Py_ssize_t nbytes, + const ArrowBitmap* bitmap1, + const ArrowBitmap* bitmap2, uint8_t* out ): cdef Py_ssize_t i + cdef const uint8_t* buf1 = bitmap1.buffer.data + cdef const uint8_t* buf2 = bitmap2.buffer.data + # Assumed caller has checked that bitmaps are equal, + # otherwise trailing comparison is undefined + cdef int64_t nbytes = bitmap1.buffer.size_bytes + for i in range(nbytes): out[i] = buf1[i] | buf2[i] @@ -532,12 +532,17 @@ cdef class BitMaskArray: @cython.wraparound(False) @staticmethod cdef void buf_xor( - const uint8_t* buf1, - const uint8_t* buf2, - Py_ssize_t nbytes, + const ArrowBitmap* bitmap1, + const ArrowBitmap* bitmap2, uint8_t* out ): cdef Py_ssize_t i + cdef const uint8_t* buf1 = bitmap1.buffer.data + cdef const uint8_t* buf2 = bitmap2.buffer.data + # Assumed caller has checked that bitmaps are equal, + # otherwise trailing comparison is undefined + cdef int64_t nbytes = bitmap1.buffer.size_bytes + for i in range(nbytes): out[i] = buf1[i] ^ buf2[i] @@ -545,12 +550,17 @@ cdef class BitMaskArray: @cython.wraparound(False) @staticmethod cdef void buf_and( - const uint8_t* buf1, - const uint8_t* buf2, - Py_ssize_t nbytes, + const ArrowBitmap* bitmap1, + const ArrowBitmap* bitmap2, uint8_t* out ): cdef Py_ssize_t i + cdef const uint8_t* buf1 = bitmap1.buffer.data + cdef const uint8_t* buf2 = bitmap2.buffer.data + # Assumed caller has checked that bitmaps are equal, + # otherwise trailing comparison is undefined + cdef int64_t nbytes = bitmap1.buffer.size_bytes + for i in range(nbytes): out[i] = buf1[i] & buf2[i] From b4aa12d3ca0158995d3e70323a4f9d30a8d048ea Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Aug 2023 19:23:55 -0400 Subject: [PATCH 052/126] setitem fastpaths --- pandas/_libs/arrays.pyx | 44 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index fdac88f165ab4..545035f397142 100644 --- 
a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -18,6 +18,7 @@ from numpy cimport ( ndarray, uint8_t, ) +from pandas.core.common import is_empty_slice cnp.import_array() @@ -37,6 +38,7 @@ cdef extern from "pandas/vendored/nanoarrow.h": void ArrowBitsUnpackInt8(const uint8_t*, int64_t, int64_t, int8_t*) int8_t ArrowBitGet(const uint8_t*, int64_t) void ArrowBitSetTo(uint8_t*, int64_t, uint8_t) + void ArrowBitsSetTo(uint8_t*, int64_t, int64_t, uint8_t) int64_t ArrowBitCountSet(const uint8_t*, int64_t, int64_t) @@ -300,6 +302,21 @@ cdef class BitMaskArray: bma.buffer_owner = True return bma + def __len__(self): + return self.bitmap.size_bits + + @cython.boundscheck(False) + @cython.wraparound(False) + cdef _set_scalar_value_from_equal_sized_array( + self, + const uint8_t[:] data, + bint value + ): + cdef Py_ssize_t i + for i in range(self.bitmap.size_bits): + if data[i]: + ArrowBitSetTo(self.bitmap.buffer.data, i, value) + def __setitem__(self, key, value): cdef const uint8_t[:] arr1d cdef Py_ssize_t i = 0 @@ -313,11 +330,28 @@ cdef class BitMaskArray: ArrowBitSetTo(self.bitmap.buffer.data, ckey, cvalue) return - arr = self.to_numpy() - arr[key] = value - arr1d = arr.ravel() - for i in range(arr1d.shape[0]): - ArrowBitSetTo(self.bitmap.buffer.data, i, arr1d[i]) + # TODO: implement fastpaths here for equal sized containers + # to avoid the to_numpy() call + if is_empty_slice(key) and isinstance(value, (int, bool)): + cvalue = value # blindly assuming ints are 0 or 1 + ArrowBitsSetTo( + self.bitmap.buffer.data, + 0, + self.bitmap.size_bits, + cvalue + ) + elif ( + isinstance(key, np.ndarray) + and key.dtype == bool + and isinstance(value, (int, bool)) + ): + self._set_scalar_value_from_equal_sized_array(key, value) + else: + arr = self.to_numpy() + arr[key] = value + arr1d = arr.ravel() + for i in range(arr1d.shape[0]): + ArrowBitSetTo(self.bitmap.buffer.data, i, arr1d[i]) def __getitem__(self, key): cdef Py_ssize_t ckey From 
8c5cd15bb0226514124588795d23dc0caa56f2d2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Aug 2023 20:51:34 -0400 Subject: [PATCH 053/126] cython < 3 compat --- pandas/_libs/arrays.pyx | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 545035f397142..6dce16ecd4280 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -369,7 +369,9 @@ cdef class BitMaskArray: def __and__(self, other): cdef ndarray[uint8_t] result cdef BitMaskArray other_bma + cdef ArrowBitmap bitmap if isinstance(other, type(self)): + bitmap = self.bitmap other_bma = other if self.bitmap.size_bits == 0: return np.empty(dtype=bool).reshape(self.array_shape) @@ -378,7 +380,7 @@ cdef class BitMaskArray: raise ValueError("bitmaps are not equal size") buf = malloc(self.bitmap.size_bits) - BitMaskArray.buf_and(&self.bitmap, &other_bma.bitmap, buf) + BitMaskArray.buf_and(&bitmap, &other_bma.bitmap, buf) result = np.empty(self.bitmap.size_bits, dtype=bool) BitMaskArray.buffer_to_array_1d( result, @@ -393,8 +395,10 @@ cdef class BitMaskArray: def __or__(self, other): cdef ndarray[uint8_t] result cdef BitMaskArray other_bma + cdef ArrowBitmap bitmap if isinstance(other, type(self)): other_bma = other + bitmap = self.bitmap # Cython >= 3 can just use &self.bitmap in calls if self.bitmap.size_bits == 0: return np.empty(dtype=bool).reshape(self.array_shape) @@ -402,7 +406,7 @@ cdef class BitMaskArray: raise ValueError("bitmaps are not equal size") buf = malloc(self.bitmap.size_bits) - BitMaskArray.buf_or(&self.bitmap, &other_bma.bitmap, buf) + BitMaskArray.buf_or(&bitmap, &other_bma.bitmap, buf) result = np.empty(self.bitmap.size_bits, dtype=bool) BitMaskArray.buffer_to_array_1d( result, @@ -417,8 +421,10 @@ cdef class BitMaskArray: def __xor__(self, other): cdef ndarray[uint8_t] result cdef BitMaskArray other_bma + cdef ArrowBitmap bitmap if isinstance(other, type(self)): other_bma = other + bitmap = 
self.bitmap # Cython >= 3 can just use &self.bitmap in calls if self.bitmap.size_bits == 0: return np.empty(dtype=bool).reshape(self.array_shape) @@ -426,7 +432,7 @@ cdef class BitMaskArray: raise ValueError("bitmaps are not equal size") buf = malloc(self.bitmap.size_bits) - BitMaskArray.buf_xor(&self.bitmap, &other_bma.bitmap, buf) + BitMaskArray.buf_xor(&bitmap, &other_bma.bitmap, buf) result = np.empty(self.bitmap.size_bits, dtype=bool) BitMaskArray.buffer_to_array_1d( result, From e904e1888790801727fcd574cb83c6814cc5af30 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Aug 2023 22:25:24 -0400 Subject: [PATCH 054/126] Revert "simplified buf passing and fixed bugs" This reverts commit 5436b04644a237abb853146577e0fbdeaf5161b4. --- pandas/_libs/arrays.pyx | 51 +++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 6dce16ecd4280..78c3bdd595dd8 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -368,24 +368,23 @@ cdef class BitMaskArray: def __and__(self, other): cdef ndarray[uint8_t] result - cdef BitMaskArray other_bma - cdef ArrowBitmap bitmap + cdef BitMaskArray other_bma, self_ = self # self_ required for Cython < 3 + if isinstance(other, type(self)): - bitmap = self.bitmap other_bma = other - if self.bitmap.size_bits == 0: + if self_.bitmap.size_bits == 0: return np.empty(dtype=bool).reshape(self.array_shape) - if self.bitmap.size_bits != other_bma.bitmap.size_bits: + if self_.bitmap.size_bits != other_bma.bitmap.size_bits: raise ValueError("bitmaps are not equal size") - buf = malloc(self.bitmap.size_bits) - BitMaskArray.buf_and(&bitmap, &other_bma.bitmap, buf) - result = np.empty(self.bitmap.size_bits, dtype=bool) + buf = malloc(self_.bitmap.size_bits) + BitMaskArray.buf_and(&self_.bitmap, &other_bma.bitmap, buf) + result = np.empty(self_.bitmap.size_bits, dtype=bool) BitMaskArray.buffer_to_array_1d( result, buf, - 
self.bitmap.size_bits + self_.bitmap.size_bits ) free(buf) return result.reshape(self.array_shape) @@ -394,24 +393,23 @@ cdef class BitMaskArray: def __or__(self, other): cdef ndarray[uint8_t] result - cdef BitMaskArray other_bma - cdef ArrowBitmap bitmap + cdef BitMaskArray other_bma, self_ = self # self_ required for Cython < 3 + if isinstance(other, type(self)): other_bma = other - bitmap = self.bitmap # Cython >= 3 can just use &self.bitmap in calls - if self.bitmap.size_bits == 0: + if self_.bitmap.size_bits == 0: return np.empty(dtype=bool).reshape(self.array_shape) - if self.bitmap.size_bits != other_bma.bitmap.size_bits: + if self_.bitmap.size_bits != other_bma.bitmap.size_bits: raise ValueError("bitmaps are not equal size") - buf = malloc(self.bitmap.size_bits) - BitMaskArray.buf_or(&bitmap, &other_bma.bitmap, buf) - result = np.empty(self.bitmap.size_bits, dtype=bool) + buf = malloc(self_.bitmap.size_bits) + BitMaskArray.buf_or(&self_.bitmap, &other_bma.bitmap, buf) + result = np.empty(self_.bitmap.size_bits, dtype=bool) BitMaskArray.buffer_to_array_1d( result, buf, - self.bitmap.size_bits + self_.bitmap.size_bits ) free(buf) return result.reshape(self.array_shape) @@ -420,24 +418,23 @@ cdef class BitMaskArray: def __xor__(self, other): cdef ndarray[uint8_t] result - cdef BitMaskArray other_bma - cdef ArrowBitmap bitmap + cdef BitMaskArray other_bma, self_ = self # self_ required for Cython < 3 + if isinstance(other, type(self)): other_bma = other - bitmap = self.bitmap # Cython >= 3 can just use &self.bitmap in calls - if self.bitmap.size_bits == 0: + if self_.bitmap.size_bits == 0: return np.empty(dtype=bool).reshape(self.array_shape) - if self.bitmap.size_bits != other_bma.bitmap.size_bits: + if self_.bitmap.size_bits != other_bma.bitmap.size_bits: raise ValueError("bitmaps are not equal size") - buf = malloc(self.bitmap.size_bits) - BitMaskArray.buf_xor(&bitmap, &other_bma.bitmap, buf) - result = np.empty(self.bitmap.size_bits, dtype=bool) + buf = 
malloc(self_.bitmap.size_bits) + BitMaskArray.buf_xor(&self_.bitmap, &other_bma.bitmap, buf) + result = np.empty(self_.bitmap.size_bits, dtype=bool) BitMaskArray.buffer_to_array_1d( result, buf, - self.bitmap.size_bits + self_.bitmap.size_bits ) free(buf) return result.reshape(self.array_shape) From c218e5127fb18cb5b6a0acccf570673e9ec96ae0 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Aug 2023 22:28:26 -0400 Subject: [PATCH 055/126] implemented all --- pandas/_libs/arrays.pyx | 24 ++++++++++++++++++++++++ pandas/core/arrays/masked.py | 11 +++++++++++ 2 files changed, 35 insertions(+) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 78c3bdd595dd8..fb4e359219e20 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -462,6 +462,9 @@ cdef class BitMaskArray: def any(self) -> bool: return BitMaskArray.buf_any(&self.bitmap) + def all(self) -> bool: + return BitMaskArray.buf_all(&self.bitmap) + def sum(self) -> int: return ArrowBitCountSet(self.bitmap.buffer.data, 0, self.bitmap.size_bits) @@ -543,6 +546,27 @@ cdef class BitMaskArray: return False + @cython.boundscheck(False) + @cython.wraparound(False) + @staticmethod + cdef bint buf_all(const ArrowBitmap* bitmap): + cdef Py_ssize_t i, bits_remaining + cdef int64_t size_bits = bitmap.size_bits + cdef const uint8_t* buf = bitmap.buffer.data + if size_bits < 1: + return True + + for i in range(bitmap.buffer.size_bytes): + if buf[i] != 256: + return False + + bits_remaining = size_bits % 8 + for i in range(bits_remaining): + if ArrowBitGet(buf, size_bits - i - 1) == 0: + return False + + return True + # TODO: clean up signatures - don't mix nbits and nbytes # Note that in cases where the size_bits doesn't end on a word # boundary that these will still operate on the remaining bits, diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 4b8e8e2681636..746afdd820247 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1383,6 
+1383,17 @@ def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ nv.validate_any((), kwargs) + # attempt to avoid to_numpy call on mask for best performance + is_all_na = self._mask.all() + if is_all_na and skipna or len(self) == 0: + return False + if not skipna and not is_all_na: + return True + if not skipna and self._mask.any(): + return self.dtype.na_value + + # fallback to numpy - will be slower + # TODO: some of these conditions are likely duplicative of above checks values = self._data.copy() # error: Argument 3 to "putmask" has incompatible type "object"; # expected "Union[_SupportsArray[dtype[Any]], From 9cf54f9d8bd7585eccf64e099cfc551cfe4a8255 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Aug 2023 22:49:35 -0400 Subject: [PATCH 056/126] faster any --- pandas/core/arrays/masked.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 746afdd820247..1f52b32160302 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1385,29 +1385,31 @@ def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): # attempt to avoid to_numpy call on mask for best performance is_all_na = self._mask.all() - if is_all_na and skipna or len(self) == 0: + is_any_na = self._mask.any() + if len(self) == 0 or (skipna and is_all_na): return False - if not skipna and not is_all_na: - return True - if not skipna and self._mask.any(): - return self.dtype.na_value - # fallback to numpy - will be slower - # TODO: some of these conditions are likely duplicative of above checks - values = self._data.copy() - # error: Argument 3 to "putmask" has incompatible type "object"; - # expected "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], - # bool, int, float, complex, str, bytes, - # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" - np.putmask( - values, 
self._mask.to_numpy(), self._falsey_value # type: ignore[arg-type] - ) + if is_any_na: + # fallback to numpy - will be slower + values = self._data.copy() + # error: Argument 3 to "putmask" has incompatible type "object"; + # expected "Union[_SupportsArray[dtype[Any]], + # _NestedSequence[_SupportsArray[dtype[Any]]], + # bool, int, float, complex, str, bytes, + # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" + np.putmask( + values, + self._mask.to_numpy(), + self._falsey_value, # type: ignore[arg-type] + ) + else: + values = self._data + result = values.any() if skipna: return result else: - if result or len(self) == 0 or not self._mask.any(): + if result or not is_any_na: return result else: return self.dtype.na_value From 4f6d035ca16fbf72c28d7f86ffe2aedbc1338bf7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Aug 2023 22:53:55 -0400 Subject: [PATCH 057/126] faster all implementation --- pandas/core/arrays/masked.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 1f52b32160302..8c39aaf8994d6 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1479,21 +1479,33 @@ def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ nv.validate_all((), kwargs) - values = self._data.copy() - # error: Argument 3 to "putmask" has incompatible type "object"; - # expected "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], - # bool, int, float, complex, str, bytes, - # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" - np.putmask( - values, self._mask.to_numpy(), self._truthy_value # type: ignore[arg-type] - ) + # attempt to avoid to_numpy call on mask for best performance + is_all_na = self._mask.all() + is_any_na = self._mask.any() + if len(self) == 0 or (skipna and is_all_na): + return True + + if is_any_na: + values = self._data.copy() + # 
error: Argument 3 to "putmask" has incompatible type "object"; + # expected "Union[_SupportsArray[dtype[Any]], + # _NestedSequence[_SupportsArray[dtype[Any]]], + # bool, int, float, complex, str, bytes, + # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" + np.putmask( + values, + self._mask.to_numpy(), + self._truthy_value, # type: ignore[arg-type] + ) + else: + values = self._data + result = values.all(axis=axis) if skipna: return result else: - if not result or len(self) == 0 or not self._mask.any(): + if not result or not self._mask.any(): return result else: return self.dtype.na_value From dca1c65fe10001cccc8b82cc0c80959f912aebf6 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Aug 2023 23:31:50 -0400 Subject: [PATCH 058/126] faster reshape --- pandas/_libs/arrays.pyi | 1 + pandas/_libs/arrays.pyx | 33 ++++++++++++++++++++++++++++----- pandas/core/arrays/masked.py | 7 ++----- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 9c53fa93f473d..5d5400ad044b0 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -52,6 +52,7 @@ class BitMaskArray: def __setitem__(self, key: PositionalIndexer, value: ArrayLike | bool) -> None: ... def __getitem__(self, key: PositionalIndexer) -> bool: ... def __invert__(self) -> np.ndarray: ... + def __eq__(self, other) -> bool: ... def __and__(self, other: np.ndarray | Self) -> np.ndarray: ... def __or__(self, other: np.ndarray | Self) -> np.ndarray: ... def __xor__(self, other: np.ndarray | Self) -> np.ndarray: ... 
diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index fb4e359219e20..9be3750dd72c8 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -11,14 +11,18 @@ from libc.stdlib cimport ( free, malloc, ) -from libc.string cimport memcpy +from libc.string cimport ( + memcmp, + memcpy, +) from numpy cimport ( int8_t, int64_t, ndarray, uint8_t, ) -from pandas.core.common import is_empty_slice + +from pandas.core.common import is_null_slice cnp.import_array() @@ -292,7 +296,7 @@ cdef class BitMaskArray: # We may want to upstream a BitmapCopy function instead ArrowBitmapInit(&bitmap) buf = malloc(old_bma.bitmap.size_bits) - memcpy(buf, old_bma.bitmap.buffer.data, old_bma.bitmap.size_bits) + memcpy(buf, old_bma.bitmap.buffer.data, old_bma.bitmap.buffer.size_bytes) bitmap.buffer.size_bytes = old_bma.bitmap.buffer.size_bytes bitmap.size_bits = old_bma.bitmap.size_bits bitmap.buffer.data = buf @@ -332,7 +336,7 @@ cdef class BitMaskArray: # TODO: implement fastpaths here for equal sized containers # to avoid the to_numpy() call - if is_empty_slice(key) and isinstance(value, (int, bool)): + if is_null_slice(key) and isinstance(value, (int, bool)): cvalue = value # blindly assuming ints are 0 or 1 ArrowBitsSetTo( self.bitmap.buffer.data, @@ -355,10 +359,13 @@ cdef class BitMaskArray: def __getitem__(self, key): cdef Py_ssize_t ckey + # to_numpy can be expensive, so try to avoid for simple cases if isinstance(key, int): ckey = key if ckey >= 0 and ckey < self.bitmap.size_bits: - return ArrowBitGet(self.bitmap.buffer.data, ckey) + return bool(ArrowBitGet(self.bitmap.buffer.data, ckey)) + elif is_null_slice(key): + return self.copy() return self.to_numpy()[key] @@ -366,6 +373,22 @@ cdef class BitMaskArray: # TODO: could invert the buffer first then go to numpy return ~self.to_numpy() + def __eq__(self, other): + cdef BitMaskArray other_bma + if isinstance(other, type(self)): + other_bma = other + if ( + self.bitmap.size_bits == other_bma.bitmap.size_bits + 
and memcmp( + self.bitmap.buffer.data, + other_bma.bitmap.buffer.data, + self.bitmap.buffer.size_bytes + ) == 0 + ): + return True + + return False + def __and__(self, other): cdef ndarray[uint8_t] result cdef BitMaskArray other_bma, self_ = self # self_ required for Cython < 3 diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 8c39aaf8994d6..d3d1c3b5f5c52 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -191,10 +191,7 @@ def __getitem__(self, item: SequenceIndexer) -> Self: def __getitem__(self, item: PositionalIndexer) -> Self | Any: item = check_array_indexer(self, item) - # TODO: need to change this to special case multiple - # indexers versus just scalar - np_mask = self._mask.to_numpy() - newmask = np_mask[item] + newmask = self._mask[item] if is_bool(newmask): # This is a scalar indexing if newmask: @@ -202,7 +199,7 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self._data[item] # sending self._mask avoids copy of buffer - if np.array_equal(newmask, np_mask): + if isinstance(newmask, BitMaskArray) and newmask == self._mask: return self._simple_new(self._data[item], self._mask) return self._simple_new(self._data[item], newmask) From 946c892d9d35e72f2c43fe91512144cfc4c9d442 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Aug 2023 07:50:40 -0400 Subject: [PATCH 059/126] Faster is_null_slice implementation --- pandas/_libs/arrays.pyx | 2 +- pandas/_libs/lib.pyx | 18 ++++++++++++++++++ pandas/core/common.py | 7 +------ 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 9be3750dd72c8..c5ff6d6c7dcb9 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -22,7 +22,7 @@ from numpy cimport ( uint8_t, ) -from pandas.core.common import is_null_slice +from pandas._libs.lib import is_null_slice cnp.import_array() diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 
2681115bbdcfb..228ad078c6927 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -8,6 +8,7 @@ from typing import ( ) cimport cython +from cpython cimport PyErr_Clear from cpython.datetime cimport ( PyDate_Check, PyDateTime_Check, @@ -29,6 +30,7 @@ from cpython.object cimport ( ) from cpython.ref cimport Py_INCREF from cpython.sequence cimport PySequence_Check +from cpython.slice cimport PySlice_Unpack from cpython.tuple cimport ( PyTuple_New, PyTuple_SET_ITEM, @@ -71,6 +73,7 @@ cdef extern from "Python.h": # Note: importing extern-style allows us to declare these as nogil # functions, whereas `from cpython cimport` does not. bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil + cdef Py_ssize_t PY_SSIZE_T_MAX cdef extern from "numpy/arrayobject.h": # cython's numpy.dtype specification is incorrect, which leads to @@ -1234,6 +1237,21 @@ def is_pyarrow_array(obj): return False +def is_null_slice(obj): + """ + Return True if given object + """ + cdef Py_ssize_t start, stop, step + if isinstance(obj, slice): + if PySlice_Unpack(obj, &start, &stop, &step) == 0: + if start == 0 and stop == PY_SSIZE_T_MAX and step == 1: + return True + else: + PyErr_Clear() + + return False + + _TYPE_MAP = { "categorical": "categorical", "category": "categorical", diff --git a/pandas/core/common.py b/pandas/core/common.py index 6d419098bf279..73e2e276a7a00 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -307,12 +307,7 @@ def is_null_slice(obj) -> bool: """ We have a null slice. 
""" - return ( - isinstance(obj, slice) - and obj.start is None - and obj.stop is None - and obj.step is None - ) + return lib.is_null_sice(obj) def is_empty_slice(obj) -> bool: From 1eb0e0120a984d57d862f9ed81457e6577efeb75 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Aug 2023 09:42:09 -0400 Subject: [PATCH 060/126] revert troublesome __getitem__ enhancements --- pandas/_libs/arrays.pyi | 1 - pandas/_libs/arrays.pyx | 21 +-------------------- pandas/core/arrays/masked.py | 6 +++++- 3 files changed, 6 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 5d5400ad044b0..9c53fa93f473d 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -52,7 +52,6 @@ class BitMaskArray: def __setitem__(self, key: PositionalIndexer, value: ArrayLike | bool) -> None: ... def __getitem__(self, key: PositionalIndexer) -> bool: ... def __invert__(self) -> np.ndarray: ... - def __eq__(self, other) -> bool: ... def __and__(self, other: np.ndarray | Self) -> np.ndarray: ... def __or__(self, other: np.ndarray | Self) -> np.ndarray: ... def __xor__(self, other: np.ndarray | Self) -> np.ndarray: ... 
diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index c5ff6d6c7dcb9..fb428ddbc2c8e 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -11,10 +11,7 @@ from libc.stdlib cimport ( free, malloc, ) -from libc.string cimport ( - memcmp, - memcpy, -) +from libc.string cimport memcpy from numpy cimport ( int8_t, int64_t, @@ -373,22 +370,6 @@ cdef class BitMaskArray: # TODO: could invert the buffer first then go to numpy return ~self.to_numpy() - def __eq__(self, other): - cdef BitMaskArray other_bma - if isinstance(other, type(self)): - other_bma = other - if ( - self.bitmap.size_bits == other_bma.bitmap.size_bits - and memcmp( - self.bitmap.buffer.data, - other_bma.bitmap.buffer.data, - self.bitmap.buffer.size_bytes - ) == 0 - ): - return True - - return False - def __and__(self, other): cdef ndarray[uint8_t] result cdef BitMaskArray other_bma, self_ = self # self_ required for Cython < 3 diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d3d1c3b5f5c52..7592f37567255 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -191,7 +191,11 @@ def __getitem__(self, item: SequenceIndexer) -> Self: def __getitem__(self, item: PositionalIndexer) -> Self | Any: item = check_array_indexer(self, item) - newmask = self._mask[item] + # TODO: some of the numpy semantics for handling 2D indexing + # are not implemented in the bitmaskarray, hence the to_numpy() + # requirement, though that slows things down + np_mask = self._mask.to_numpy() + newmask = np_mask[item] if is_bool(newmask): # This is a scalar indexing if newmask: From 07594d644a4e332ce3fffb6039473cd6ec4358e1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Aug 2023 10:01:38 -0400 Subject: [PATCH 061/126] typo fixup --- pandas/core/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index b93506d14da87..d75cf52149bf9 100644 --- a/pandas/core/common.py +++ 
b/pandas/core/common.py @@ -307,7 +307,7 @@ def is_null_slice(obj) -> bool: """ We have a null slice. """ - return lib.is_null_sice(obj) + return lib.is_null_slice(obj) def is_empty_slice(obj) -> bool: From d30b6138f49b727fefac0faabc21ad3a02232dc4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Aug 2023 10:06:01 -0400 Subject: [PATCH 062/126] finish revert --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 7592f37567255..d5a5b481bb3c7 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -203,7 +203,7 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self._data[item] # sending self._mask avoids copy of buffer - if isinstance(newmask, BitMaskArray) and newmask == self._mask: + if np.array_equal(newmask, np_mask): return self._simple_new(self._data[item], self._mask) return self._simple_new(self._data[item], newmask) From 34ac61318927881649bc200348573235717c9226 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Aug 2023 12:25:57 -0400 Subject: [PATCH 063/126] reshape fast path --- pandas/_libs/arrays.pyx | 82 +++++++++++++++++++----------------- pandas/core/algorithms.py | 13 ++---- pandas/core/arrays/masked.py | 17 +++++--- 3 files changed, 56 insertions(+), 56 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index fb428ddbc2c8e..2e0614fcc2eba 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -34,6 +34,7 @@ cdef extern from "pandas/vendored/nanoarrow.h": void ArrowBitmapInit(ArrowBitmap*) void ArrowBitmapReserve(ArrowBitmap*, int64_t) + void ArrowBitmapAppendUnsafe(ArrowBitmap*, uint8_t, int64_t) void ArrowBitmapAppendInt8Unsafe(ArrowBitmap*, const int8_t *, int64_t) void ArrowBitmapReset(ArrowBitmap*) void ArrowBitsUnpackInt8(const uint8_t*, int64_t, int64_t, int8_t*) @@ -41,6 +42,7 @@ cdef extern from "pandas/vendored/nanoarrow.h": void 
ArrowBitSetTo(uint8_t*, int64_t, uint8_t) void ArrowBitsSetTo(uint8_t*, int64_t, int64_t, uint8_t) int64_t ArrowBitCountSet(const uint8_t*, int64_t, int64_t) + void ArrowBitmapReset(ArrowBitmap*) @cython.freelist(16) @@ -474,51 +476,53 @@ cdef class BitMaskArray: @cython.wraparound(False) @cython.boundscheck(False) - cdef void c_take( - self, - const int64_t[:] indices, - uint8_t[:] out, - bint fill_value, - bint allow_fill - ): - # TODO: we should try and upstream this into nanoarrow with a better algo + cdef int ctake_1d(self, const int64_t[:] indices, ArrowBitmap* out_bitmap): + """returns -1 in case a negative index is encountered, 0 on success""" + cdef bint value cdef Py_ssize_t i - cdef uint8_t value - if not allow_fill: - for i in range(indices.shape[0]): - out[i] = ArrowBitGet(self.bitmap.buffer.data, indices[i]) - else: - for i in range(indices.shape[0]): - value = ArrowBitGet(self.bitmap.buffer.data, indices[i]) - if value == 1: - out[i] = fill_value - else: - out[i] = value - - def take( + cdef int64_t index + cdef nindices = indices.shape[0] + + for i in range(nindices): + index = indices[i] + if index < 0: + return -1 + + value = ArrowBitGet(self.bitmap.buffer.data, index) + ArrowBitmapAppendUnsafe(out_bitmap, value, 1) + + def take_1d( self, - const int64_t[:] indices, - int axis=0, - bint fill_value=0, - bint allow_fill=0 - ) -> np.ndarray: + indices, + const int axis=0, + ): + cdef Py_ssize_t nindices = len(indices) if axis != 0: raise NotImplementedError( - "BitMaskArray.take only implemented for axis=0" + "BitMaskArray.take_1d only implemented for axis=0" + ) + + if nindices <= 0: + raise NotImplementedError( + "take_1d does not support empty takes" ) - # TODO: would be great to check this here, though most of these functions - # are by definition unsafe - # if indices.min() < 0: - # raise NotImplementedError( - # "BitMaskArray.take does not support negative index values" - # ) - - # TODO: indices.shape gave wrong number of dimensions, expected 1 
got 8 - # len(indices) works the same as long as 1d assumption holds - result = np.empty(len(indices), dtype=bool) - self.c_take(indices, result, fill_value, allow_fill) - return result + cdef ArrowBitmap bitmap + cdef BitMaskArray bma = BitMaskArray.__new__(BitMaskArray) + + # TODO: this leaks a bit into the internals of the nanoarrow bitmap + # We may want to upstream a BitmapCopy function instead + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, nindices) + + if self.ctake_1d(indices, &bitmap) != 0: + ArrowBitmapReset(&bitmap) + raise ValueError("take_1d does not support negative indexing") + + bma.bitmap = bitmap + bma.array_shape = indices.shape + bma.buffer_owner = True + return bma def copy(self): return BitMaskArray.copy_from_bitmaskarray(self) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b7c6cb2130c58..26f344330dc11 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -21,7 +21,6 @@ iNaT, lib, ) -from pandas._libs.arrays import BitMaskArray from pandas._typing import ( AnyArrayLike, ArrayLike, @@ -1287,9 +1286,7 @@ def take( ... 
fill_value=-10) array([ 10, 10, -10]) """ - if not isinstance( - arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries, BitMaskArray) - ): + if not isinstance(arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries)): # GH#52981 warnings.warn( "pd.api.extensions.take accepting non-standard inputs is deprecated " @@ -1299,15 +1296,11 @@ def take( stacklevel=find_stack_level(), ) - if not isinstance(arr, BitMaskArray) and not is_array_like(arr): + if not is_array_like(arr): arr = np.asarray(arr) indices = ensure_platform_int(indices) - # BitMaskArray does not support negative indexing - if isinstance(arr, BitMaskArray) and indices.size > 0 and indices.min() < 0: - arr = arr.to_numpy() - if allow_fill: # Pandas style, -1 means NA validate_indices(indices, arr.shape[axis]) @@ -1315,7 +1308,7 @@ def take( arr, indices, axis=axis, allow_fill=True, fill_value=fill_value ) else: - # NumPy / BitMaskArray style + # NumPy style result = arr.take(indices, axis=axis) return result diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d5a5b481bb3c7..c5abc85754f14 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -937,13 +937,16 @@ def take( axis=axis, ) - mask = take( - self._mask, - indexer, - fill_value=True, - allow_fill=allow_fill, - axis=axis, - ) + try: + mask = self._mask.take_1d(indexer) + except (ValueError, NotImplementedError): + mask = take( + self._mask.to_numpy(), + indexer, + fill_value=True, + allow_fill=allow_fill, + axis=axis, + ) # if we are filling # we only fill where the indexer is null From 44aae253c84de1901db951a4a7353f4e9a90de63 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Aug 2023 14:43:44 -0400 Subject: [PATCH 064/126] fix is_null_slice --- pandas/_libs/lib.pyx | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 228ad078c6927..6fed8e1339895 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -8,7 
+8,6 @@ from typing import ( ) cimport cython -from cpython cimport PyErr_Clear from cpython.datetime cimport ( PyDate_Check, PyDateTime_Check, @@ -1243,11 +1242,13 @@ def is_null_slice(obj): """ cdef Py_ssize_t start, stop, step if isinstance(obj, slice): - if PySlice_Unpack(obj, &start, &stop, &step) == 0: - if start == 0 and stop == PY_SSIZE_T_MAX and step == 1: - return True - else: - PyErr_Clear() + try: + PySlice_Unpack(obj, &start, &stop, &step) + except TypeError: + return False + + if start == 0 and stop == PY_SSIZE_T_MAX and step == 1: + return True return False From 8b72d09407a24aceaa51be18767995c3a5467629 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Aug 2023 14:45:51 -0400 Subject: [PATCH 065/126] fix indexer perf boost --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index c5abc85754f14..2b6ee50875184 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -939,7 +939,7 @@ def take( try: mask = self._mask.take_1d(indexer) - except (ValueError, NotImplementedError): + except (TypeError, ValueError, NotImplementedError): mask = take( self._mask.to_numpy(), indexer, From 45d1cf06ac24b144be62f14eb3aec847b91d8a26 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Aug 2023 15:03:21 -0400 Subject: [PATCH 066/126] less to_numpy() --- pandas/_libs/arrays.pyx | 4 ++++ pandas/core/arrays/masked.py | 22 +++++++++++----------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 2e0614fcc2eba..ef1a2d7bbd9bf 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -451,6 +451,10 @@ cdef class BitMaskArray: object_state = (self.to_numpy(), self.parent) return (_unpickle_bitmaskarray, object_state, self.parent) + @property + def size(self) -> int: + return self.bitmap.size_bits + @property def nbytes(self) -> int: return 
self.bitmap.buffer.size_bytes diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 2b6ee50875184..e462a929cedae 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -420,16 +420,16 @@ def round(self, decimals: int = 0, *args, **kwargs): # Unary Methods def __invert__(self) -> Self: - return self._simple_new(~self._data, self._mask.to_numpy()) + return self._simple_new(~self._data, self._mask.copy()) def __neg__(self) -> Self: - return self._simple_new(-self._data, self._mask.to_numpy()) + return self._simple_new(-self._data, self._mask.copy()) def __pos__(self) -> Self: return self.copy() def __abs__(self) -> Self: - return self._simple_new(abs(self._data), self._mask.to_numpy()) + return self._simple_new(abs(self._data), self._mask.copy()) # ------------------------------------------------------------------ @@ -1236,7 +1236,7 @@ def sum( result = masked_reductions.sum( self._data, - self._mask.to_numpy(), + self._mask, skipna=skipna, min_count=min_count, axis=axis, @@ -1257,7 +1257,7 @@ def prod( result = masked_reductions.prod( self._data, - self._mask.to_numpy(), + self._mask, skipna=skipna, min_count=min_count, axis=axis, @@ -1270,7 +1270,7 @@ def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): nv.validate_mean((), kwargs) result = masked_reductions.mean( self._data, - self._mask.to_numpy(), + self._mask, skipna=skipna, axis=axis, ) @@ -1282,7 +1282,7 @@ def var( nv.validate_stat_ddof_func((), kwargs, fname="var") result = masked_reductions.var( self._data, - self._mask.to_numpy(), + self._mask, skipna=skipna, axis=axis, ddof=ddof, @@ -1295,7 +1295,7 @@ def std( nv.validate_stat_ddof_func((), kwargs, fname="std") result = masked_reductions.std( self._data, - self._mask.to_numpy(), + self._mask, skipna=skipna, axis=axis, ddof=ddof, @@ -1306,7 +1306,7 @@ def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): nv.validate_min((), kwargs) result = masked_reductions.min( 
self._data, - self._mask.to_numpy(), + self._mask, skipna=skipna, axis=axis, ) @@ -1316,7 +1316,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): nv.validate_max((), kwargs) result = masked_reductions.max( self._data, - self._mask.to_numpy(), + self._mask, skipna=skipna, axis=axis, ) @@ -1518,7 +1518,7 @@ def _accumulate( self, name: str, *, skipna: bool = True, **kwargs ) -> BaseMaskedArray: data = self._data - mask = self._mask.to_numpy() + mask = self._mask op = getattr(masked_accumulations, name) data, mask = op(data, mask, skipna=skipna, **kwargs) From 68b7191a3a57df6376797de63aada4007dac8b6e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Aug 2023 16:12:23 -0400 Subject: [PATCH 067/126] make bitmaskarray iterable --- pandas/_libs/arrays.pyi | 1 + pandas/_libs/arrays.pyx | 8 ++++++++ pandas/core/arrays/masked.py | 2 +- pandas/core/ops/mask_ops.py | 13 +++++-------- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 9c53fa93f473d..f6e342aaeadb4 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -56,6 +56,7 @@ class BitMaskArray: def __or__(self, other: np.ndarray | Self) -> np.ndarray: ... def __xor__(self, other: np.ndarray | Self) -> np.ndarray: ... def __reduce__(self) -> Tuple[Callable[[np.ndarray], Self], Tuple[np.ndarray]]: ... + def __iter__(self): ... @property def nbytes(self) -> int: ... def shape(self) -> tuple[int, ...]: ... 
diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index ef1a2d7bbd9bf..d87f6c52d79c2 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -451,6 +451,14 @@ cdef class BitMaskArray: object_state = (self.to_numpy(), self.parent) return (_unpickle_bitmaskarray, object_state, self.parent) + @cython.boundscheck(False) + @cython.wraparound(False) + def __iter__(self): + cdef Py_ssize_t i + cdef BitMaskArray self_ = self # self_ required for Cython < 3 + for i in range(self_.bitmap.size_bits): + yield bool(ArrowBitGet(self_.bitmap.buffer.data, i)) + @property def size(self) -> int: return self.bitmap.size_bits diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index e462a929cedae..6deee8924fc1f 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -341,7 +341,7 @@ def __iter__(self) -> Iterator: yield val else: na_value = self.dtype.na_value - for isna_, val in zip(self._mask.to_numpy(), self._data): + for isna_, val in zip(self._mask, self._data): if isna_: yield na_value else: diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index a6dbb1db28af2..a66822f50217e 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -175,21 +175,18 @@ def kleene_and( result = left & right if right_mask is None: - if isinstance(left_mask, BitMaskArray): - left_mask = left_mask.to_numpy() - # Scalar `right` if right is libmissing.NA: - mask = (left & ~left_mask) | left_mask + if left_mask.any(): + mask = (left & ~left_mask) | left_mask else: - if not isinstance(left_mask, BitMaskArray): # already a copy - mask = left_mask.copy() + mask = left_mask.copy() if right is False: # unmask everything mask[:] = False else: - # TODO: Cython 3 changed support for radd / ror methods and may - # not be working? 
For now convert to NumPy + # Since we must compare to left / right it helps perf to convert + # to numpy up front, rather than deferring multiple times if isinstance(left_mask, BitMaskArray): left_mask = left_mask.to_numpy() if isinstance(right_mask, BitMaskArray): From 685f48169c2d73aca5423a1a03ed0813913181a1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Aug 2023 16:14:14 -0400 Subject: [PATCH 068/126] typing cleanups --- pandas/_libs/arrays.pyi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index f6e342aaeadb4..a6eef23b0f830 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -57,12 +57,13 @@ class BitMaskArray: def __xor__(self, other: np.ndarray | Self) -> np.ndarray: ... def __reduce__(self) -> Tuple[Callable[[np.ndarray], Self], Tuple[np.ndarray]]: ... def __iter__(self): ... + def size(self) -> int: ... @property def nbytes(self) -> int: ... def shape(self) -> tuple[int, ...]: ... def dtype(self) -> type_t[bool]: ... def any(self) -> bool: ... def sum(self) -> int: ... - def take(self, indices: np.ndarray, axis: int, fill_value: bool) -> np.ndarray: ... + def take_1d(self, indices: np.ndarray, axis: int) -> Self: ... def copy(self) -> Self: ... def to_numpy(self) -> np.ndarray: ... 
From 82826e90b74d5f71584adb43795a98e9bf1c189d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Aug 2023 17:09:41 -0400 Subject: [PATCH 069/126] boolean fixes --- pandas/core/arrays/masked.py | 4 ++-- pandas/core/ops/mask_ops.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 6deee8924fc1f..9020c85f00327 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1391,7 +1391,7 @@ def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): is_all_na = self._mask.all() is_any_na = self._mask.any() if len(self) == 0 or (skipna and is_all_na): - return False + return np.bool_(False) if is_any_na: # fallback to numpy - will be slower @@ -1487,7 +1487,7 @@ def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): is_all_na = self._mask.all() is_any_na = self._mask.any() if len(self) == 0 or (skipna and is_all_na): - return True + return np.bool_(True) if is_any_na: values = self._data.copy() diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index a66822f50217e..1f37b568f1ec5 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -179,6 +179,8 @@ def kleene_and( if right is libmissing.NA: if left_mask.any(): mask = (left & ~left_mask) | left_mask + else: + mask = left else: mask = left_mask.copy() if right is False: From 78e42454199d15ab171586bd5781eb11f6c7088d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 21 Aug 2023 21:27:54 -0400 Subject: [PATCH 070/126] perf in take --- pandas/_libs/arrays.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index d87f6c52d79c2..4794d0ee92f06 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -493,7 +493,7 @@ cdef class BitMaskArray: cdef bint value cdef Py_ssize_t i cdef int64_t index - cdef nindices = indices.shape[0] + cdef Py_ssize_t nindices = 
indices.shape[0] for i in range(nindices): index = indices[i] From 69c51c22cbcbc929ada6e5e07eaa73fd4fa398e5 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Aug 2023 16:49:11 -0400 Subject: [PATCH 071/126] fixed typing --- pandas/_libs/arrays.pyi | 4 ++++ pandas/_libs/lib.pyi | 1 + pandas/core/array_algos/masked_reductions.py | 19 ++++++++++--------- pandas/core/arrays/masked.py | 2 +- pandas/core/nanops.py | 11 +++++++++-- pandas/core/ops/mask_ops.py | 5 +++-- 6 files changed, 28 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index a6eef23b0f830..b3ed14cb1c30c 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -57,12 +57,16 @@ class BitMaskArray: def __xor__(self, other: np.ndarray | Self) -> np.ndarray: ... def __reduce__(self) -> Tuple[Callable[[np.ndarray], Self], Tuple[np.ndarray]]: ... def __iter__(self): ... + @property def size(self) -> int: ... @property def nbytes(self) -> int: ... + @property def shape(self) -> tuple[int, ...]: ... + @property def dtype(self) -> type_t[bool]: ... def any(self) -> bool: ... + def all(self) -> bool: ... def sum(self) -> int: ... def take_1d(self, indices: np.ndarray, axis: int) -> Self: ... def copy(self) -> Self: ... diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 32641319a6b96..4427a7ce734c8 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -44,6 +44,7 @@ def is_iterator(obj: object) -> bool: ... def is_scalar(val: object) -> bool: ... def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ... def is_pyarrow_array(obj: object) -> bool: ... +def is_null_slice(obj: object) -> bool: ... def is_period(val: object) -> TypeGuard[Period]: ... def is_interval(val: object) -> TypeGuard[Interval]: ... def is_decimal(val: object) -> TypeGuard[Decimal]: ... 
diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 335fa1afc0f4e..6d4fbcc3c34e5 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -17,6 +17,7 @@ from pandas.core.nanops import check_below_min_count if TYPE_CHECKING: + from pandas._libs.arrays import BitMaskArray from pandas._typing import ( AxisInt, npt, @@ -26,7 +27,7 @@ def _reductions( func: Callable, values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: npt.NDArray[np.bool_] | BitMaskArray, *, skipna: bool = True, min_count: int = 0, @@ -67,7 +68,7 @@ def _reductions( def sum( values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: npt.NDArray[np.bool_] | BitMaskArray, *, skipna: bool = True, min_count: int = 0, @@ -80,7 +81,7 @@ def sum( def prod( values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: npt.NDArray[np.bool_] | BitMaskArray, *, skipna: bool = True, min_count: int = 0, @@ -94,7 +95,7 @@ def prod( def _minmax( func: Callable, values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: npt.NDArray[np.bool_] | BitMaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -131,7 +132,7 @@ def _minmax( def min( values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: npt.NDArray[np.bool_] | BitMaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -141,7 +142,7 @@ def min( def max( values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: npt.NDArray[np.bool_] | BitMaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -151,7 +152,7 @@ def max( def mean( values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: npt.NDArray[np.bool_] | BitMaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -163,7 +164,7 @@ def mean( def var( values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: npt.NDArray[np.bool_] | BitMaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -181,7 +182,7 @@ def var( def std( values: 
np.ndarray, - mask: npt.NDArray[np.bool_], + mask: npt.NDArray[np.bool_] | BitMaskArray, *, skipna: bool = True, axis: AxisInt | None = None, diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 9020c85f00327..766c9eb7ddffb 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -938,7 +938,7 @@ def take( ) try: - mask = self._mask.take_1d(indexer) + mask = self._mask.take_1d(indexer, axis=axis) except (TypeError, ValueError, NotImplementedError): mask = take( self._mask.to_numpy(), diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index e60c42a20a9af..babbb757c8a61 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -3,6 +3,7 @@ import functools import itertools from typing import ( + TYPE_CHECKING, Any, Callable, cast, @@ -49,6 +50,10 @@ notna, ) +if TYPE_CHECKING: + from pandas._libs.arrays import BitMaskArray + + bn = import_optional_dependency("bottleneck", errors="warn") _BOTTLENECK_INSTALLED = bn is not None _USE_BOTTLENECK = False @@ -1537,7 +1542,9 @@ def _maybe_null_out( def check_below_min_count( - shape: tuple[int, ...], mask: npt.NDArray[np.bool_] | None, min_count: int + shape: tuple[int, ...], + mask: npt.NDArray[np.bool_] | BitMaskArray | None, + min_count: int, ) -> bool: """ Check for the `min_count` keyword. 
Returns True if below `min_count` (when @@ -1561,7 +1568,7 @@ def check_below_min_count( # no missing values, only check size non_nulls = np.prod(shape) else: - non_nulls = mask.size - mask.sum() + non_nulls = mask.size - mask.sum() # type: ignore[assignment] if non_nulls < min_count: return True return False diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index 1f37b568f1ec5..8136354659d6b 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -70,7 +70,8 @@ def kleene_or( if right is True: mask = np.zeros(left_mask.shape, left_mask.dtype) else: - left_mask = left_mask.to_numpy() + if isinstance(left_mask, BitMaskArray): + left_mask = left_mask.to_numpy() if right is libmissing.NA: mask = (~left & ~left_mask) | left_mask else: @@ -178,7 +179,7 @@ def kleene_and( # Scalar `right` if right is libmissing.NA: if left_mask.any(): - mask = (left & ~left_mask) | left_mask + mask = (left & ~left_mask) | left_mask # type: ignore[operator] else: mask = left else: From 404268f5b24fb194f0c60c38bb3349db68cc3aa2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Aug 2023 17:17:23 -0400 Subject: [PATCH 072/126] rework pickling --- pandas/_libs/arrays.pyi | 9 ++---- pandas/_libs/arrays.pyx | 71 +++++++++++++++++++++++++++++------------ 2 files changed, 54 insertions(+), 26 deletions(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index b3ed14cb1c30c..23f32cf5b3a15 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -1,8 +1,4 @@ -from typing import ( - Callable, - Sequence, - Tuple, -) +from typing import Sequence import numpy as np @@ -55,7 +51,8 @@ class BitMaskArray: def __and__(self, other: np.ndarray | Self) -> np.ndarray: ... def __or__(self, other: np.ndarray | Self) -> np.ndarray: ... def __xor__(self, other: np.ndarray | Self) -> np.ndarray: ... - def __reduce__(self) -> Tuple[Callable[[np.ndarray], Self], Tuple[np.ndarray]]: ... + def __getstate__(self) -> dict: ... 
+ def __setstate__(self, other: dict) -> None: ... def __iter__(self): ... @property def size(self) -> int: ... diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 4794d0ee92f06..d310c0d2f91f5 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -224,11 +224,6 @@ cdef class NDArrayBacked: return to_concat[0]._from_backing_data(new_arr) -def _unpickle_bitmaskarray(array, parent): - bma = BitMaskArray(array, parent) - return bma - - cdef class BitMaskArray: cdef: ArrowBitmap bitmap @@ -257,23 +252,15 @@ cdef class BitMaskArray: def __cinit__(self): self.parent = False - def __init__(self, data, parent=None): - # parent is only required to reconstruct ref-counting from pickle - # but should not be called from user code + def __init__(self, data): if isinstance(data, np.ndarray): self.init_from_ndarray(data.ravel()) self.array_shape = data.shape - if parent: - self.parent = parent - else: - self.parent = None + self.parent = None elif isinstance(data, type(self)): self.init_from_bitmaskarray(data) self.array_shape = data.array_shape - if parent: - self.parent = parent - else: - self.parent = data + self.parent = data else: raise TypeError("Unsupported argument to BitMaskArray constructor") @@ -294,7 +281,7 @@ cdef class BitMaskArray: # TODO: this leaks a bit into the internals of the nanoarrow bitmap # We may want to upstream a BitmapCopy function instead ArrowBitmapInit(&bitmap) - buf = malloc(old_bma.bitmap.size_bits) + buf = malloc(old_bma.bitmap.size_bytes) memcpy(buf, old_bma.bitmap.buffer.data, old_bma.bitmap.buffer.size_bytes) bitmap.buffer.size_bytes = old_bma.bitmap.buffer.size_bytes bitmap.size_bits = old_bma.bitmap.size_bits @@ -447,9 +434,53 @@ cdef class BitMaskArray: return self.to_numpy() ^ other - def __reduce__(self): - object_state = (self.to_numpy(), self.parent) - return (_unpickle_bitmaskarray, object_state, self.parent) + def __getstate__(self): + cdef BitMaskArray self_ = self + state = { + "parent": self.parent, 
+ "array_shape": self.array_shape, + "buffer_owner": self_.buffer_owner, + # Private ArrowBitmap attributes below + "bitmap.buffer.size_bytes": self_.bitmap.buffer.size_bytes, + "bitmap.size_bits": self_.bitmap.size_bits + } + + # Only parents own data + if self_.buffer_owner: + bitmap_data = bytearray(self_.bitmap.buffer.size_bytes) + for i in range(self_.bitmap.buffer.size_bytes): + bitmap_data[i] = self_.bitmap.buffer.data[i] + + state["bitmap_data"] = bitmap_data + + return state + + def __setstate__(self, state): + cdef ArrowBitmap bitmap + cdef BitMaskArray self_ = self, other + self.parent = state["parent"] + self.array_shape = state["array_shape"] + self_.buffer_owner = state["buffer_owner"] + + nbytes = state["bitmap.buffer.size_bytes"] + nbits = state["bitmap.size_bits"] + if not self_.buffer_owner: + other = self.parent + self_.bitmap = other.bitmap + self_.bitmap.size_bits = nbits + self_.bitmap.buffer.size_bytes = nbytes + else: + ArrowBitmapInit(&bitmap) + + buf = malloc(nbytes) + data = state["bitmap_data"] + for i in range(nbytes): + buf[i] = data[i] + + bitmap.buffer.data = buf + bitmap.buffer.size_bytes = nbytes + bitmap.size_bits = nbits + self_.bitmap = bitmap @cython.boundscheck(False) @cython.wraparound(False) From 28dd82d1f787b4af865611f8a894b19f154c2be1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Aug 2023 17:43:53 -0400 Subject: [PATCH 073/126] fixed attribute lookup --- pandas/_libs/arrays.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index d310c0d2f91f5..6a8740fb833ff 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -281,7 +281,7 @@ cdef class BitMaskArray: # TODO: this leaks a bit into the internals of the nanoarrow bitmap # We may want to upstream a BitmapCopy function instead ArrowBitmapInit(&bitmap) - buf = malloc(old_bma.bitmap.size_bytes) + buf = malloc(old_bma.bitmap.buffer.size_bytes) memcpy(buf, old_bma.bitmap.buffer.data, 
old_bma.bitmap.buffer.size_bytes) bitmap.buffer.size_bytes = old_bma.bitmap.buffer.size_bytes bitmap.size_bits = old_bma.bitmap.size_bits From f0bc4a2bcbb177a440ea97e376f9c876fab9fd68 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Aug 2023 19:57:32 -0400 Subject: [PATCH 074/126] More efficient invert --- pandas/_libs/arrays.pyx | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 6a8740fb833ff..83846c5f73827 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -356,8 +356,19 @@ cdef class BitMaskArray: return self.to_numpy()[key] def __invert__(self): - # TODO: could invert the buffer first then go to numpy - return ~self.to_numpy() + # TODO: should we return a mask here instead of a NumPy array? + cdef Py_ssize_t i + cdef BitMaskArray self_ = self + cdef ndarray[uint8_t] result = np.empty(self_.bitmap.size_bits, dtype=bool) + + cdef uint8_t* buf = malloc(self_.bitmap.buffer.size_bytes) + for i in range(self_.bitmap.buffer.size_bytes): + buf[i] = ~self_.bitmap.buffer.data[i] + + BitMaskArray.buffer_to_array_1d(result, buf, self_.bitmap.size_bits) + free(buf) + + return result def __and__(self, other): cdef ndarray[uint8_t] result From b6ae9bbc2791feade1942fec49f667f63ad03211 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Aug 2023 22:23:17 -0400 Subject: [PATCH 075/126] doc fix --- pandas/core/arrays/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3827b5b5d40b2..9f715c9629ee0 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -594,7 +594,7 @@ def nbytes(self) -> int: Examples -------- >>> pd.array([1, 2, 3]).nbytes - 27 + 25 """ # If this is expensive to compute, return an approximate lower bound # on the number of bytes needed. 
From e1825aedd7fee0ffc3750afc37b9fc144158cef0 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Aug 2023 23:22:31 -0400 Subject: [PATCH 076/126] Have invert return BitMaskArray --- pandas/_libs/arrays.pyi | 2 +- pandas/_libs/arrays.pyx | 22 +++++++++----- pandas/core/array_algos/masked_reductions.py | 32 ++++++++++---------- pandas/core/arrays/masked.py | 12 ++++---- 4 files changed, 38 insertions(+), 30 deletions(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 23f32cf5b3a15..73ca5ea86d324 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -47,7 +47,7 @@ class BitMaskArray: def __init__(self, data: np.ndarray | Self) -> None: ... def __setitem__(self, key: PositionalIndexer, value: ArrayLike | bool) -> None: ... def __getitem__(self, key: PositionalIndexer) -> bool: ... - def __invert__(self) -> np.ndarray: ... + def __invert__(self) -> Self: ... def __and__(self, other: np.ndarray | Self) -> np.ndarray: ... def __or__(self, other: np.ndarray | Self) -> np.ndarray: ... def __xor__(self, other: np.ndarray | Self) -> np.ndarray: ... diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 83846c5f73827..eda01241b0399 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -356,19 +356,27 @@ cdef class BitMaskArray: return self.to_numpy()[key] def __invert__(self): - # TODO: should we return a mask here instead of a NumPy array? 
cdef Py_ssize_t i cdef BitMaskArray self_ = self - cdef ndarray[uint8_t] result = np.empty(self_.bitmap.size_bits, dtype=bool) + cdef BitMaskArray bma = BitMaskArray.__new__(BitMaskArray) + cdef ArrowBitmap bitmap + + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) - cdef uint8_t* buf = malloc(self_.bitmap.buffer.size_bytes) for i in range(self_.bitmap.buffer.size_bytes): - buf[i] = ~self_.bitmap.buffer.data[i] + bitmap.buffer.data[i] = ~self_.bitmap.buffer.data[i] - BitMaskArray.buffer_to_array_1d(result, buf, self_.bitmap.size_bits) - free(buf) + # TODO: avoid nanoarrow internals + bitmap.size_bits = self_.bitmap.size_bits + bitmap.buffer.size_bytes = self_.bitmap.buffer.size_bytes - return result + bma.bitmap = bitmap + bma.array_shape = self.array_shape + bma.buffer_owner = True + bma.parent = None + + return bma def __and__(self, other): cdef ndarray[uint8_t] result diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 6d4fbcc3c34e5..f426e57da4380 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -13,21 +13,18 @@ import numpy as np from pandas._libs import missing as libmissing +from pandas._libs.arrays import BitMaskArray from pandas.core.nanops import check_below_min_count if TYPE_CHECKING: - from pandas._libs.arrays import BitMaskArray - from pandas._typing import ( - AxisInt, - npt, - ) + from pandas._typing import AxisInt def _reductions( func: Callable, values: np.ndarray, - mask: npt.NDArray[np.bool_] | BitMaskArray, + mask: BitMaskArray, *, skipna: bool = True, min_count: int = 0, @@ -63,12 +60,12 @@ def _reductions( ): return libmissing.NA - return func(values, where=~mask, axis=axis, **kwargs) + return func(values, where=~mask.to_numpy(), axis=axis, **kwargs) def sum( values: np.ndarray, - mask: npt.NDArray[np.bool_] | BitMaskArray, + mask: BitMaskArray, *, skipna: bool = True, min_count: int = 0, @@ 
-81,7 +78,7 @@ def sum( def prod( values: np.ndarray, - mask: npt.NDArray[np.bool_] | BitMaskArray, + mask: BitMaskArray, *, skipna: bool = True, min_count: int = 0, @@ -95,7 +92,7 @@ def prod( def _minmax( func: Callable, values: np.ndarray, - mask: npt.NDArray[np.bool_] | BitMaskArray, + mask: np.ndarray | BitMaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -122,7 +119,10 @@ def _minmax( else: return func(values, axis=axis) else: - subset = values[~mask] + if isinstance(mask, BitMaskArray): + subset = values[(~mask).to_numpy()] + else: + subset = values[~mask] if subset.size: return func(subset, axis=axis) else: @@ -132,7 +132,7 @@ def _minmax( def min( values: np.ndarray, - mask: npt.NDArray[np.bool_] | BitMaskArray, + mask: np.ndarray | BitMaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -142,7 +142,7 @@ def min( def max( values: np.ndarray, - mask: npt.NDArray[np.bool_] | BitMaskArray, + mask: np.ndarray | BitMaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -152,7 +152,7 @@ def max( def mean( values: np.ndarray, - mask: npt.NDArray[np.bool_] | BitMaskArray, + mask: BitMaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -164,7 +164,7 @@ def mean( def var( values: np.ndarray, - mask: npt.NDArray[np.bool_] | BitMaskArray, + mask: BitMaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -182,7 +182,7 @@ def var( def std( values: np.ndarray, - mask: npt.NDArray[np.bool_] | BitMaskArray, + mask: BitMaskArray, *, skipna: bool = True, axis: AxisInt | None = None, diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 766c9eb7ddffb..363c8addea6aa 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -788,21 +788,21 @@ def _arith_method(self, other, op): if op_name == "pow": # 1 ** x is 1. 
- mask = np.where((self._data == 1) & ~self._mask, False, mask) + mask = np.where((self._data == 1) & (~self._mask).to_numpy(), False, mask) # x ** 0 is 1. if omask is not None: - mask = np.where((other == 0) & ~omask, False, mask) + mask = np.where((other == 0) & (~omask).to_numpy(), False, mask) elif other is not libmissing.NA: mask = np.where(other == 0, False, mask) elif op_name == "rpow": # 1 ** x is 1. if omask is not None: - mask = np.where((other == 1) & ~omask, False, mask) + mask = np.where((other == 1) & (~omask).to_numpy(), False, mask) elif other is not libmissing.NA: mask = np.where(other == 1, False, mask) # x ** 0 is 1. - mask = np.where((self._data == 0) & ~self._mask, False, mask) + mask = np.where((self._data == 0) & (~self._mask).to_numpy(), False, mask) return self._maybe_mask_result(result, mask) @@ -1113,8 +1113,8 @@ def equals(self, other) -> bool: if not np.array_equal(self._mask.to_numpy(), other._mask.to_numpy()): return False - left = self._data[~self._mask] - right = other._data[~other._mask] + left = self._data[(~self._mask).to_numpy()] + right = other._data[(~other._mask).to_numpy()] return array_equivalent(left, right, strict_nan=True, dtype_equal=True) def _quantile( From 5211e2e7cc828a3550313b7001b6913cc9589f5e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Aug 2023 10:30:04 -0400 Subject: [PATCH 077/126] Implemented Bitmask Concatenate --- pandas/_libs/arrays.pyi | 2 + pandas/_libs/arrays.pyx | 63 +++++++++++++++++++ .../_libs/include/pandas/bitmask_algorithms.h | 16 +++++ pandas/_libs/meson.build | 2 +- pandas/_libs/src/bitmask_algorithms.c | 43 +++++++++++++ pandas/core/arrays/masked.py | 5 +- 6 files changed, 129 insertions(+), 2 deletions(-) create mode 100644 pandas/_libs/include/pandas/bitmask_algorithms.h create mode 100644 pandas/_libs/src/bitmask_algorithms.c diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 73ca5ea86d324..f3414659b53ec 100644 --- a/pandas/_libs/arrays.pyi +++ 
b/pandas/_libs/arrays.pyi @@ -54,6 +54,8 @@ class BitMaskArray: def __getstate__(self) -> dict: ... def __setstate__(self, other: dict) -> None: ... def __iter__(self): ... + @staticmethod + def concatenate(objs: list[Self], axis: int) -> Self: ... @property def size(self) -> int: ... @property diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index eda01241b0399..d4bf9bd8687aa 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -44,6 +44,9 @@ cdef extern from "pandas/vendored/nanoarrow.h": int64_t ArrowBitCountSet(const uint8_t*, int64_t, int64_t) void ArrowBitmapReset(ArrowBitmap*) +cdef extern from "pandas/bitmask_algorithms.h": + void ConcatenateBitmapData(ArrowBitmap*, size_t, uint8_t*) + @cython.freelist(16) cdef class NDArrayBacked: @@ -295,6 +298,66 @@ cdef class BitMaskArray: def __len__(self): return self.bitmap.size_bits + @cython.wraparound(False) + @cython.boundscheck(False) + @staticmethod + cdef BitMaskArray c_concatenate(list objs): + cdef Py_ssize_t i + cdef int64_t bytes_needed, total_bits = 0 + cdef BitMaskArray current_bma + cdef Py_ssize_t nbitmaps = len(objs) + + cdef Py_ssize_t second_dim = 0 + if any(len(x.array_shape) > 1 for x in objs): + second_dim = objs[0].array_shape[1] + for obj in objs: + if not obj.array_shape[1] == second_dim: + raise NotImplementedError( + "BitMaskArray.concatenate does not support broadcasting" + ) + + cdef ArrowBitmap* bitmaps = malloc(sizeof(ArrowBitmap) * nbitmaps) + for i in range(nbitmaps): + current_bma = objs[i] + total_bits += current_bma.bitmap.size_bits + bitmaps[i] = current_bma.bitmap + + # Bypass __init__ calls + cdef BitMaskArray bma = BitMaskArray.__new__(BitMaskArray) + cdef ArrowBitmap bitmap + + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, total_bits) + ConcatenateBitmapData(bitmaps, nbitmaps, bitmap.buffer.data) + free(bitmaps) + + # TODO: avoid nanoarrow internals + bitmap.size_bits = total_bits + bytes_needed = total_bits // 8 + if total_bits % 8 != 0: 
+ bytes_needed += 1 + bitmap.buffer.size_bytes = bytes_needed + + bma.bitmap = bitmap + + if second_dim != 0: + bma.array_shape = tuple((total_bits // second_dim, second_dim)) + else: + bma.array_shape = tuple((total_bits,)) + bma.buffer_owner = True + bma.parent = None + + return bma + + @staticmethod + def concatenate(objs, axis): + if axis != 0: + raise NotImplementedError( + "BitMaskArray.concatenate only implemented for axis=0" + ) + + return BitMaskArray.c_concatenate(objs) + @cython.boundscheck(False) @cython.wraparound(False) cdef _set_scalar_value_from_equal_sized_array( diff --git a/pandas/_libs/include/pandas/bitmask_algorithms.h b/pandas/_libs/include/pandas/bitmask_algorithms.h new file mode 100644 index 0000000000000..408a37f5e1495 --- /dev/null +++ b/pandas/_libs/include/pandas/bitmask_algorithms.h @@ -0,0 +1,16 @@ +#include +#include + +#include "pandas/vendored/nanoarrow.h" + +/* + Ordered concatenation of bitmasks. Masks is the data itself, + nmasks is the number of masks to concatenate, mask_nbits is the + number of bits within each mask to concatenate. + + Concatenation preserves order. + + out is assumed to have enough bytes to hold all elements. 
+*/ +void ConcatenateBitmapData(struct ArrowBitmap *bitmaps, size_t nbitmaps, + uint8_t *out); diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index 29b1298050619..849b839d33a87 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -62,7 +62,7 @@ libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} # numpy include dir is implicitly included 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper, _khash_primitive_helper]}, - 'arrays': {'sources': ['arrays.pyx', 'src/vendored/nanoarrow.c'], 'includes': ['include/pandas/vendored']}, + 'arrays': {'sources': ['arrays.pyx', 'src/vendored/nanoarrow.c', 'src/bitmask_algorithms.c'], 'includes': ['include/pandas/vendored']}, 'groupby': {'sources': ['groupby.pyx']}, 'hashing': {'sources': ['hashing.pyx']}, 'hashtable': {'sources': ['hashtable.pyx', _khash_primitive_helper, _hashtable_class_helper, _hashtable_func_helper]}, diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c new file mode 100644 index 0000000000000..06775257e3386 --- /dev/null +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -0,0 +1,43 @@ +#include + +#include "pandas/bitmask_algorithms.h" + +static const uint8_t clear_mask[8] = {0x0, 0x1, 0x3, 0x7, + 0xf, 0x1f, 0x3f, 0x7f}; + +void ConcatenateBitmapData(struct ArrowBitmap *bitmaps, size_t nbitmaps, + uint8_t *out) { + if (nbitmaps == 0) { + return; + } + + uint8_t *out_cursor = out; + // As we loop through each array, any time we end up starting + // on a word boundary we can simply use memcpy. 
If we are not + // so lucky we fall back to bit shifting each element + size_t start_bit_pos = 0; + for (size_t i = 0; i < nbitmaps; i++) { + struct ArrowBitmap bitmap = bitmaps[i]; + int64_t nbytes = bitmap.buffer.size_bytes; + size_t trailing_nbits = bitmap.size_bits % 8; + + if (start_bit_pos == 0) { + memcpy(out_cursor, bitmap.buffer.data, nbytes); + } else { + for (size_t j = 0; j < nbytes; j++) { + uint8_t lshifted = bitmap.buffer.data[j] << start_bit_pos; + out_cursor[j] = (out_cursor[j] & clear_mask[start_bit_pos]) | lshifted; + + uint8_t rshifted = bitmap.buffer.data[j] >> (8 - start_bit_pos); + out_cursor[j + 1] = rshifted; + } + } + + start_bit_pos = (start_bit_pos + trailing_nbits) % 8; + if (start_bit_pos == 0) { + out_cursor += nbytes; + } else { + out_cursor += nbytes - 1; + } + } +} diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 363c8addea6aa..89b3df8aa89f9 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -915,7 +915,10 @@ def _concat_same_type( axis: AxisInt = 0, ) -> Self: data = np.concatenate([x._data for x in to_concat], axis=axis) - mask = np.concatenate([x._mask.to_numpy() for x in to_concat], axis=axis) + try: + mask = BitMaskArray.concatenate([x._mask for x in to_concat], axis=axis) + except NotImplementedError: + mask = np.concatenate([x._mask.to_numpy() for x in to_concat], axis=axis) return cls(data, mask) def take( From cfa3b931c278ea99a673433139559b82965be2ac Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Aug 2023 11:17:50 -0400 Subject: [PATCH 078/126] bitmask_any moved to algorithms --- pandas/_libs/arrays.pyx | 32 +++---------- .../_libs/include/pandas/bitmask_algorithms.h | 17 +++---- pandas/_libs/src/bitmask_algorithms.c | 45 ++++++++++++++----- 3 files changed, 51 insertions(+), 43 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index d4bf9bd8687aa..875876e38b125 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ 
-45,7 +45,8 @@ cdef extern from "pandas/vendored/nanoarrow.h": void ArrowBitmapReset(ArrowBitmap*) cdef extern from "pandas/bitmask_algorithms.h": - void ConcatenateBitmapData(ArrowBitmap*, size_t, uint8_t*) + void ConcatenateBitmapData(const ArrowBitmap**, size_t, const uint8_t*) + bint BitmapAny(const ArrowBitmap*) @cython.freelist(16) @@ -316,11 +317,13 @@ cdef class BitMaskArray: "BitMaskArray.concatenate does not support broadcasting" ) - cdef ArrowBitmap* bitmaps = malloc(sizeof(ArrowBitmap) * nbitmaps) + cdef ArrowBitmap** bitmaps = malloc( + sizeof(ArrowBitmap*) * nbitmaps + ) for i in range(nbitmaps): current_bma = objs[i] total_bits += current_bma.bitmap.size_bits - bitmaps[i] = current_bma.bitmap + bitmaps[i] = ¤t_bma.bitmap # Bypass __init__ calls cdef BitMaskArray bma = BitMaskArray.__new__(BitMaskArray) @@ -591,7 +594,7 @@ cdef class BitMaskArray: return bool def any(self) -> bool: - return BitMaskArray.buf_any(&self.bitmap) + return BitmapAny(&self.bitmap) def all(self) -> bool: return BitMaskArray.buf_all(&self.bitmap) @@ -658,27 +661,6 @@ cdef class BitMaskArray: cdef void buffer_to_array_1d(uint8_t[:] out, const uint8_t* buf, Py_ssize_t size): ArrowBitsUnpackInt8(buf, 0, size, &out[0]) - @cython.boundscheck(False) - @cython.wraparound(False) - @staticmethod - cdef bint buf_any(const ArrowBitmap* bitmap): - cdef Py_ssize_t i, bits_remaining - cdef int64_t size_bits = bitmap.size_bits - cdef const uint8_t* buf = bitmap.buffer.data - if size_bits < 1: - return False - - for i in range(bitmap.buffer.size_bytes): - if buf[i] > 0: - return True - - bits_remaining = size_bits % 8 - for i in range(bits_remaining): - if ArrowBitGet(buf, size_bits - i - 1): - return True - - return False - @cython.boundscheck(False) @cython.wraparound(False) @staticmethod diff --git a/pandas/_libs/include/pandas/bitmask_algorithms.h b/pandas/_libs/include/pandas/bitmask_algorithms.h index 408a37f5e1495..093905f887777 100644 --- 
a/pandas/_libs/include/pandas/bitmask_algorithms.h +++ b/pandas/_libs/include/pandas/bitmask_algorithms.h @@ -1,16 +1,17 @@ +#pragma once + +#include #include #include #include "pandas/vendored/nanoarrow.h" /* - Ordered concatenation of bitmasks. Masks is the data itself, - nmasks is the number of masks to concatenate, mask_nbits is the - number of bits within each mask to concatenate. - - Concatenation preserves order. - - out is assumed to have enough bytes to hold all elements. + Concatenates the data from an array of bitmaps with size nbitmaps + into a buffer "out". Order is preserved and out is assumed to have + enough bytes to hold all elements. */ -void ConcatenateBitmapData(struct ArrowBitmap *bitmaps, size_t nbitmaps, +void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, uint8_t *out); + +bool BitmapAny(const struct ArrowBitmap* bitmap); diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index 06775257e3386..7d6d22a12736d 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -5,30 +5,30 @@ static const uint8_t clear_mask[8] = {0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f}; -void ConcatenateBitmapData(struct ArrowBitmap *bitmaps, size_t nbitmaps, +void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, uint8_t *out) { if (nbitmaps == 0) { return; } uint8_t *out_cursor = out; - // As we loop through each array, any time we end up starting - // on a word boundary we can simply use memcpy. 
If we are not - // so lucky we fall back to bit shifting each element size_t start_bit_pos = 0; for (size_t i = 0; i < nbitmaps; i++) { - struct ArrowBitmap bitmap = bitmaps[i]; - int64_t nbytes = bitmap.buffer.size_bytes; - size_t trailing_nbits = bitmap.size_bits % 8; + const struct ArrowBitmap* bitmap = bitmaps[i]; + const int64_t nbytes = bitmap->buffer.size_bytes; + const size_t trailing_nbits = bitmap->size_bits % 8; + // As we loop through each array, any time we end up starting + // on a word boundary we can simply use memcpy. If we are not + // so lucky we fall back to bit shifting each element if (start_bit_pos == 0) { - memcpy(out_cursor, bitmap.buffer.data, nbytes); + memcpy(out_cursor, bitmap->buffer.data, nbytes); } else { for (size_t j = 0; j < nbytes; j++) { - uint8_t lshifted = bitmap.buffer.data[j] << start_bit_pos; + const uint8_t lshifted = bitmap->buffer.data[j] << start_bit_pos; out_cursor[j] = (out_cursor[j] & clear_mask[start_bit_pos]) | lshifted; - uint8_t rshifted = bitmap.buffer.data[j] >> (8 - start_bit_pos); + const uint8_t rshifted = bitmap->buffer.data[j] >> (8 - start_bit_pos); out_cursor[j + 1] = rshifted; } } @@ -41,3 +41,28 @@ void ConcatenateBitmapData(struct ArrowBitmap *bitmaps, size_t nbitmaps, } } } + +bool BitmapAny(const struct ArrowBitmap* bitmap) { + const size_t nbits = bitmap->size_bits; + const size_t size_bytes = bitmap->buffer.size_bytes; + const uint8_t* buf = bitmap->buffer.data; + + if (nbits < 1) { + return false; + } + + for (size_t i = 0; i < size_bytes - 1; i++) { + if (buf[i] > 0) { + return true; + } + } + + const size_t bits_remaining = nbits % 8; + for (size_t i = 0; i < bits_remaining; i++) { + if (ArrowBitGet(buf, nbits - i - 1)) { + return true; + } + } + + return false; +} From 6df2930e7029cbba37cbc15fcfc8f490e157a188 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Aug 2023 11:48:56 -0400 Subject: [PATCH 079/126] more algorithms --- pandas/_libs/arrays.pyx | 133 ++++-------------- 
.../_libs/include/pandas/bitmask_algorithms.h | 18 ++- pandas/_libs/src/bitmask_algorithms.c | 100 ++++++++++++- 3 files changed, 145 insertions(+), 106 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 875876e38b125..d8c86db6d2e1d 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -47,6 +47,11 @@ cdef extern from "pandas/vendored/nanoarrow.h": cdef extern from "pandas/bitmask_algorithms.h": void ConcatenateBitmapData(const ArrowBitmap**, size_t, const uint8_t*) bint BitmapAny(const ArrowBitmap*) + bint BitmapAll(const ArrowBitmap*) + bint BitmapOr(const ArrowBitmap*, const ArrowBitmap*, ArrowBitmap*) + bint BitmapXor(const ArrowBitmap*, const ArrowBitmap*, ArrowBitmap*) + bint BitmapAnd(const ArrowBitmap*, const ArrowBitmap*, ArrowBitmap*) + bint BitmapInvert(const ArrowBitmap*, ArrowBitmap*) @cython.freelist(16) @@ -422,7 +427,6 @@ cdef class BitMaskArray: return self.to_numpy()[key] def __invert__(self): - cdef Py_ssize_t i cdef BitMaskArray self_ = self cdef BitMaskArray bma = BitMaskArray.__new__(BitMaskArray) cdef ArrowBitmap bitmap @@ -430,12 +434,7 @@ cdef class BitMaskArray: ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) - for i in range(self_.bitmap.buffer.size_bytes): - bitmap.buffer.data[i] = ~self_.bitmap.buffer.data[i] - - # TODO: avoid nanoarrow internals - bitmap.size_bits = self_.bitmap.size_bits - bitmap.buffer.size_bytes = self_.bitmap.buffer.size_bytes + BitmapInvert(&self_.bitmap, &bitmap) bma.bitmap = bitmap bma.array_shape = self.array_shape @@ -447,6 +446,7 @@ cdef class BitMaskArray: def __and__(self, other): cdef ndarray[uint8_t] result cdef BitMaskArray other_bma, self_ = self # self_ required for Cython < 3 + cdef ArrowBitmap bitmap if isinstance(other, type(self)): other_bma = other @@ -456,15 +456,17 @@ cdef class BitMaskArray: if self_.bitmap.size_bits != other_bma.bitmap.size_bits: raise ValueError("bitmaps are not equal size") - buf = 
malloc(self_.bitmap.size_bits) - BitMaskArray.buf_and(&self_.bitmap, &other_bma.bitmap, buf) + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) + BitmapAnd(&self_.bitmap, &other_bma.bitmap, &bitmap) + result = np.empty(self_.bitmap.size_bits, dtype=bool) BitMaskArray.buffer_to_array_1d( result, - buf, - self_.bitmap.size_bits + bitmap.buffer.data, + bitmap.size_bits ) - free(buf) + ArrowBitmapReset(&bitmap) return result.reshape(self.array_shape) return self.to_numpy() & other @@ -472,6 +474,7 @@ cdef class BitMaskArray: def __or__(self, other): cdef ndarray[uint8_t] result cdef BitMaskArray other_bma, self_ = self # self_ required for Cython < 3 + cdef ArrowBitmap bitmap if isinstance(other, type(self)): other_bma = other @@ -481,15 +484,17 @@ cdef class BitMaskArray: if self_.bitmap.size_bits != other_bma.bitmap.size_bits: raise ValueError("bitmaps are not equal size") - buf = malloc(self_.bitmap.size_bits) - BitMaskArray.buf_or(&self_.bitmap, &other_bma.bitmap, buf) + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) + BitmapOr(&self_.bitmap, &other_bma.bitmap, &bitmap) + result = np.empty(self_.bitmap.size_bits, dtype=bool) BitMaskArray.buffer_to_array_1d( result, - buf, - self_.bitmap.size_bits + bitmap.buffer.data, + bitmap.size_bits ) - free(buf) + ArrowBitmapReset(&bitmap) return result.reshape(self.array_shape) return self.to_numpy() | other @@ -497,6 +502,7 @@ cdef class BitMaskArray: def __xor__(self, other): cdef ndarray[uint8_t] result cdef BitMaskArray other_bma, self_ = self # self_ required for Cython < 3 + cdef ArrowBitmap bitmap if isinstance(other, type(self)): other_bma = other @@ -506,15 +512,17 @@ cdef class BitMaskArray: if self_.bitmap.size_bits != other_bma.bitmap.size_bits: raise ValueError("bitmaps are not equal size") - buf = malloc(self_.bitmap.size_bits) - BitMaskArray.buf_xor(&self_.bitmap, &other_bma.bitmap, buf) + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, 
self_.bitmap.size_bits) + BitmapXor(&self_.bitmap, &other_bma.bitmap, &bitmap) + result = np.empty(self_.bitmap.size_bits, dtype=bool) BitMaskArray.buffer_to_array_1d( result, - buf, - self_.bitmap.size_bits + bitmap.buffer.data, + bitmap.size_bits ) - free(buf) + ArrowBitmapReset(&bitmap) return result.reshape(self.array_shape) return self.to_numpy() ^ other @@ -597,7 +605,7 @@ cdef class BitMaskArray: return BitmapAny(&self.bitmap) def all(self) -> bool: - return BitMaskArray.buf_all(&self.bitmap) + return BitmapAll(&self.bitmap) def sum(self) -> int: return ArrowBitCountSet(self.bitmap.buffer.data, 0, self.bitmap.size_bits) @@ -661,85 +669,6 @@ cdef class BitMaskArray: cdef void buffer_to_array_1d(uint8_t[:] out, const uint8_t* buf, Py_ssize_t size): ArrowBitsUnpackInt8(buf, 0, size, &out[0]) - @cython.boundscheck(False) - @cython.wraparound(False) - @staticmethod - cdef bint buf_all(const ArrowBitmap* bitmap): - cdef Py_ssize_t i, bits_remaining - cdef int64_t size_bits = bitmap.size_bits - cdef const uint8_t* buf = bitmap.buffer.data - if size_bits < 1: - return True - - for i in range(bitmap.buffer.size_bytes): - if buf[i] != 256: - return False - - bits_remaining = size_bits % 8 - for i in range(bits_remaining): - if ArrowBitGet(buf, size_bits - i - 1) == 0: - return False - - return True - - # TODO: clean up signatures - don't mix nbits and nbytes - # Note that in cases where the size_bits doesn't end on a word - # boundary that these will still operate on the remaining bits, - # with undefined values therein - @cython.boundscheck(False) - @cython.wraparound(False) - @staticmethod - cdef void buf_or( - const ArrowBitmap* bitmap1, - const ArrowBitmap* bitmap2, - uint8_t* out - ): - cdef Py_ssize_t i - cdef const uint8_t* buf1 = bitmap1.buffer.data - cdef const uint8_t* buf2 = bitmap2.buffer.data - # Assumed caller has checked that bitmaps are equal, - # otherwise trailing comparison is undefined - cdef int64_t nbytes = bitmap1.buffer.size_bytes - - for i in 
range(nbytes): - out[i] = buf1[i] | buf2[i] - - @cython.boundscheck(False) - @cython.wraparound(False) - @staticmethod - cdef void buf_xor( - const ArrowBitmap* bitmap1, - const ArrowBitmap* bitmap2, - uint8_t* out - ): - cdef Py_ssize_t i - cdef const uint8_t* buf1 = bitmap1.buffer.data - cdef const uint8_t* buf2 = bitmap2.buffer.data - # Assumed caller has checked that bitmaps are equal, - # otherwise trailing comparison is undefined - cdef int64_t nbytes = bitmap1.buffer.size_bytes - - for i in range(nbytes): - out[i] = buf1[i] ^ buf2[i] - - @cython.boundscheck(False) - @cython.wraparound(False) - @staticmethod - cdef void buf_and( - const ArrowBitmap* bitmap1, - const ArrowBitmap* bitmap2, - uint8_t* out - ): - cdef Py_ssize_t i - cdef const uint8_t* buf1 = bitmap1.buffer.data - cdef const uint8_t* buf2 = bitmap2.buffer.data - # Assumed caller has checked that bitmaps are equal, - # otherwise trailing comparison is undefined - cdef int64_t nbytes = bitmap1.buffer.size_bytes - - for i in range(nbytes): - out[i] = buf1[i] & buf2[i] - def to_numpy(self) -> ndarray: cdef ndarray[uint8_t] result = np.empty(self.bitmap.size_bits, dtype=bool) BitMaskArray.buffer_to_array_1d( diff --git a/pandas/_libs/include/pandas/bitmask_algorithms.h b/pandas/_libs/include/pandas/bitmask_algorithms.h index 093905f887777..b0850b6923a03 100644 --- a/pandas/_libs/include/pandas/bitmask_algorithms.h +++ b/pandas/_libs/include/pandas/bitmask_algorithms.h @@ -14,4 +14,20 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, uint8_t *out); -bool BitmapAny(const struct ArrowBitmap* bitmap); +bool BitmapAny(const struct ArrowBitmap *bitmap); +bool BitmapAll(const struct ArrowBitmap *bitmap); + +/* Returns -1 on failure. On success returns 0 and writes to out */ +int BitmapOr(const struct ArrowBitmap *bitmap1, + const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out); + +/* Returns -1 on failure. 
On success returns 0 and writes to out */ +int BitmapXor(const struct ArrowBitmap *bitmap1, + const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out); + +/* Returns -1 on failure. On success returns 0 and writes to out */ +int BitmapAnd(const struct ArrowBitmap *bitmap1, + const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out); + +/* Returns -1 on failure. On success returns 0 and writes to out */ +int BitmapInvert(const struct ArrowBitmap *bitmap, struct ArrowBitmap *out); diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index 7d6d22a12736d..74a4cc0695d8c 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -14,7 +14,7 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, uint8_t *out_cursor = out; size_t start_bit_pos = 0; for (size_t i = 0; i < nbitmaps; i++) { - const struct ArrowBitmap* bitmap = bitmaps[i]; + const struct ArrowBitmap *bitmap = bitmaps[i]; const int64_t nbytes = bitmap->buffer.size_bytes; const size_t trailing_nbits = bitmap->size_bits % 8; @@ -42,10 +42,10 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, } } -bool BitmapAny(const struct ArrowBitmap* bitmap) { +bool BitmapAny(const struct ArrowBitmap *bitmap) { const size_t nbits = bitmap->size_bits; const size_t size_bytes = bitmap->buffer.size_bytes; - const uint8_t* buf = bitmap->buffer.data; + const uint8_t *buf = bitmap->buffer.data; if (nbits < 1) { return false; @@ -66,3 +66,97 @@ bool BitmapAny(const struct ArrowBitmap* bitmap) { return false; } + +bool BitmapAll(const struct ArrowBitmap *bitmap) { + const size_t nbits = bitmap->size_bits; + const size_t size_bytes = bitmap->buffer.size_bytes; + const uint8_t *buf = bitmap->buffer.data; + + if (nbits < 1) { + return true; + } + + for (size_t i = 0; i < size_bytes - 1; i++) { + if (buf[i] != 0xff) { + return false; + } + } + + const size_t bits_remaining = nbits % 8; + for (size_t 
i = 0; i < bits_remaining; i++) { + if (ArrowBitGet(buf, nbits - i - 1) == 0) { + return false; + } + } + + return true; +} + +int BitmapOr(const struct ArrowBitmap *bitmap1, + const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out) { + if (bitmap1->size_bits != bitmap2->size_bits) { + return -1; + } else if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { + return -1; + } + + for (size_t i = 0; i < bitmap1->buffer.size_bytes; i++) { + out->buffer.data[i] = bitmap1->buffer.data[i] | bitmap2->buffer.data[i]; + } + + out->size_bits = bitmap1->size_bits; + out->buffer.size_bytes = bitmap1->buffer.size_bytes; + + return 0; +} + +int BitmapAnd(const struct ArrowBitmap *bitmap1, + const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out) { + if (bitmap1->size_bits != bitmap2->size_bits) { + return -1; + } else if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { + return -1; + } + + for (size_t i = 0; i < bitmap1->buffer.size_bytes; i++) { + out->buffer.data[i] = bitmap1->buffer.data[i] & bitmap2->buffer.data[i]; + } + + out->size_bits = bitmap1->size_bits; + out->buffer.size_bytes = bitmap1->buffer.size_bytes; + + return 0; +} + +int BitmapXor(const struct ArrowBitmap *bitmap1, + const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out) { + if (bitmap1->size_bits != bitmap2->size_bits) { + return -1; + } else if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { + return -1; + } + + for (size_t i = 0; i < bitmap1->buffer.size_bytes; i++) { + out->buffer.data[i] = bitmap1->buffer.data[i] ^ bitmap2->buffer.data[i]; + } + + out->size_bits = bitmap1->size_bits; + out->buffer.size_bytes = bitmap1->buffer.size_bytes; + + return 0; +} + +int BitmapInvert(const struct ArrowBitmap *bitmap1, struct ArrowBitmap *out) { + if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { + return -1; + } + + for (size_t i = 0; i < bitmap1->buffer.size_bytes; i++) { + out->buffer.data[i] = ~bitmap1->buffer.data[i]; + } + + 
out->size_bits = bitmap1->size_bits; + out->buffer.size_bytes = bitmap1->buffer.size_bytes; + + return 0; +} From 06f3b01a99272822502d62e21327f7be9b40754e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Aug 2023 12:53:42 -0400 Subject: [PATCH 080/126] C-implemented take / putmask --- pandas/_libs/arrays.pyx | 58 ++++++++----------- .../_libs/include/pandas/bitmask_algorithms.h | 8 +++ pandas/_libs/src/bitmask_algorithms.c | 56 ++++++++++++++++-- 3 files changed, 82 insertions(+), 40 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index d8c86db6d2e1d..74565fe92e36e 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -27,6 +27,7 @@ cdef extern from "pandas/vendored/nanoarrow.h": struct ArrowBuffer: uint8_t* data int64_t size_bytes + int64_t capacity_bytes struct ArrowBitmap: ArrowBuffer buffer @@ -52,6 +53,8 @@ cdef extern from "pandas/bitmask_algorithms.h": bint BitmapXor(const ArrowBitmap*, const ArrowBitmap*, ArrowBitmap*) bint BitmapAnd(const ArrowBitmap*, const ArrowBitmap*, ArrowBitmap*) bint BitmapInvert(const ArrowBitmap*, ArrowBitmap*) + bint BitmapTake(const ArrowBitmap*, const int64_t*, size_t, ArrowBitmap*) + bint BitmapPutFromBufferMask(ArrowBitmap*, const uint8_t*, size_t, uint8_t) @cython.freelist(16) @@ -293,6 +296,7 @@ cdef class BitMaskArray: buf = malloc(old_bma.bitmap.buffer.size_bytes) memcpy(buf, old_bma.bitmap.buffer.data, old_bma.bitmap.buffer.size_bytes) bitmap.buffer.size_bytes = old_bma.bitmap.buffer.size_bytes + bitmap.buffer.capacity_bytes = old_bma.bitmap.buffer.capacity_bytes bitmap.size_bits = old_bma.bitmap.size_bits bitmap.buffer.data = buf @@ -339,12 +343,13 @@ cdef class BitMaskArray: ConcatenateBitmapData(bitmaps, nbitmaps, bitmap.buffer.data) free(bitmaps) - # TODO: avoid nanoarrow internals + # TODO: avoid nanoarrow internals - maybe handle in concat function? 
bitmap.size_bits = total_bits bytes_needed = total_bits // 8 if total_bits % 8 != 0: bytes_needed += 1 bitmap.buffer.size_bytes = bytes_needed + bitmap.buffer.capacity_bytes = bytes_needed bma.bitmap = bitmap @@ -366,23 +371,13 @@ cdef class BitMaskArray: return BitMaskArray.c_concatenate(objs) - @cython.boundscheck(False) - @cython.wraparound(False) - cdef _set_scalar_value_from_equal_sized_array( - self, - const uint8_t[:] data, - bint value - ): - cdef Py_ssize_t i - for i in range(self.bitmap.size_bits): - if data[i]: - ArrowBitSetTo(self.bitmap.buffer.data, i, value) - def __setitem__(self, key, value): + cdef const uint8_t[:] keymask cdef const uint8_t[:] arr1d cdef Py_ssize_t i = 0 cdef Py_ssize_t ckey cdef bint cvalue + cdef BitMaskArray self_ = self if isinstance(key, int): ckey = key @@ -406,7 +401,14 @@ cdef class BitMaskArray: and key.dtype == bool and isinstance(value, (int, bool)) ): - self._set_scalar_value_from_equal_sized_array(key, value) + keymask = key + if BitmapPutFromBufferMask( + &self_.bitmap, + &keymask[0], + keymask.shape[0], + value + ) != 0: + raise ValueError("BitMaskArray.__setitem__ failed!") else: arr = self.to_numpy() arr[key] = value @@ -535,6 +537,7 @@ cdef class BitMaskArray: "buffer_owner": self_.buffer_owner, # Private ArrowBitmap attributes below "bitmap.buffer.size_bytes": self_.bitmap.buffer.size_bytes, + "bitmap.buffer.capacity_bytes": self_.bitmap.buffer.capacity_bytes, "bitmap.size_bits": self_.bitmap.size_bits } @@ -556,12 +559,14 @@ cdef class BitMaskArray: self_.buffer_owner = state["buffer_owner"] nbytes = state["bitmap.buffer.size_bytes"] + capacity_bytes = state["bitmap.buffer.capacity_bytes"] nbits = state["bitmap.size_bits"] if not self_.buffer_owner: other = self.parent self_.bitmap = other.bitmap self_.bitmap.size_bits = nbits self_.bitmap.buffer.size_bytes = nbytes + self_.bitmap.buffer.capacity_bytes = capacity_bytes else: ArrowBitmapInit(&bitmap) @@ -572,6 +577,7 @@ cdef class BitMaskArray: 
bitmap.buffer.data = buf bitmap.buffer.size_bytes = nbytes + bitmap.buffer.capacity_bytes = nbytes bitmap.size_bits = nbits self_.bitmap = bitmap @@ -610,28 +616,12 @@ cdef class BitMaskArray: def sum(self) -> int: return ArrowBitCountSet(self.bitmap.buffer.data, 0, self.bitmap.size_bits) - @cython.wraparound(False) - @cython.boundscheck(False) - cdef int ctake_1d(self, const int64_t[:] indices, ArrowBitmap* out_bitmap): - """returns -1 in case a negative index is encountered, 0 on success""" - cdef bint value - cdef Py_ssize_t i - cdef int64_t index - cdef Py_ssize_t nindices = indices.shape[0] - - for i in range(nindices): - index = indices[i] - if index < 0: - return -1 - - value = ArrowBitGet(self.bitmap.buffer.data, index) - ArrowBitmapAppendUnsafe(out_bitmap, value, 1) - def take_1d( self, - indices, + const int64_t[:] indices, const int axis=0, ): + cdef BitMaskArray self_ = self cdef Py_ssize_t nindices = len(indices) if axis != 0: raise NotImplementedError( @@ -651,12 +641,12 @@ cdef class BitMaskArray: ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, nindices) - if self.ctake_1d(indices, &bitmap) != 0: + if BitmapTake(&self_.bitmap, &indices[0], nindices, &bitmap) != 0: ArrowBitmapReset(&bitmap) raise ValueError("take_1d does not support negative indexing") bma.bitmap = bitmap - bma.array_shape = indices.shape + bma.array_shape = tuple((indices.shape[0],)) bma.buffer_owner = True return bma diff --git a/pandas/_libs/include/pandas/bitmask_algorithms.h b/pandas/_libs/include/pandas/bitmask_algorithms.h index b0850b6923a03..fb2bbdd711ac0 100644 --- a/pandas/_libs/include/pandas/bitmask_algorithms.h +++ b/pandas/_libs/include/pandas/bitmask_algorithms.h @@ -31,3 +31,11 @@ int BitmapAnd(const struct ArrowBitmap *bitmap1, /* Returns -1 on failure. On success returns 0 and writes to out */ int BitmapInvert(const struct ArrowBitmap *bitmap, struct ArrowBitmap *out); + +/* Returns -1 on failure. 
On success returns 0 and writes to out */ +int BitmapTake(const struct ArrowBitmap *bitmap, const int64_t *indices, + size_t nindices, struct ArrowBitmap *out); + +/* Returns -1 on failure. On success returns 0 and writes to out */ +int BitmapPutFromBufferMask(struct ArrowBitmap *bitmap, const uint8_t *buf, + size_t n, uint8_t value); diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index 74a4cc0695d8c..7181fc502d321 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -146,17 +146,61 @@ int BitmapXor(const struct ArrowBitmap *bitmap1, return 0; } -int BitmapInvert(const struct ArrowBitmap *bitmap1, struct ArrowBitmap *out) { - if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { +int BitmapInvert(const struct ArrowBitmap *bitmap, struct ArrowBitmap *out) { + if (!(out->buffer.capacity_bytes >= bitmap->buffer.size_bytes)) { return -1; } - for (size_t i = 0; i < bitmap1->buffer.size_bytes; i++) { - out->buffer.data[i] = ~bitmap1->buffer.data[i]; + for (size_t i = 0; i < bitmap->buffer.size_bytes; i++) { + out->buffer.data[i] = ~bitmap->buffer.data[i]; } - out->size_bits = bitmap1->size_bits; - out->buffer.size_bytes = bitmap1->buffer.size_bytes; + out->size_bits = bitmap->size_bits; + out->buffer.size_bytes = bitmap->buffer.size_bytes; + + return 0; +} + +int BitmapTake(const struct ArrowBitmap *bitmap, const int64_t *indices, + size_t nindices, struct ArrowBitmap *out) { + int64_t bytes_needed = nindices / 8; + if ((nindices % 8) > 0) { + bytes_needed += 1; + } + + if (!(out->buffer.capacity_bytes >= bytes_needed)) { + return -1; + } + + for (size_t i = 0; i < nindices; i++) { + int64_t index = indices[i]; + if (index < 0) { + return -1; + } + + int8_t value = ArrowBitGet(bitmap->buffer.data, index); + ArrowBitmapAppendUnsafe(out, value, 1); + } + + return 0; +} + +int BitmapPutFromBufferMask(struct ArrowBitmap *bitmap, const uint8_t *buf, + size_t n, uint8_t value) 
{ + int64_t bytes_needed = n / 8; + if ((n % 8) > 0) { + bytes_needed += 1; + } + + if (bytes_needed > bitmap->buffer.capacity_bytes) { + return -1; + } + + for (size_t i = 0; i < n; i++) { + if (buf[i]) { + ArrowBitSetTo(bitmap->buffer.data, i, value); + } + } return 0; } From 9a6187459e88c9d45d133fdd81e546440351a967 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Aug 2023 13:04:32 -0400 Subject: [PATCH 081/126] clean up calling conventions --- pandas/_libs/arrays.pyx | 20 ++++--------------- .../_libs/include/pandas/bitmask_algorithms.h | 2 +- pandas/_libs/src/bitmask_algorithms.c | 13 ++++++++++-- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 74565fe92e36e..e4b7d9907fa48 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -46,7 +46,7 @@ cdef extern from "pandas/vendored/nanoarrow.h": void ArrowBitmapReset(ArrowBitmap*) cdef extern from "pandas/bitmask_algorithms.h": - void ConcatenateBitmapData(const ArrowBitmap**, size_t, const uint8_t*) + void ConcatenateBitmapData(const ArrowBitmap**, size_t, ArrowBitmap*) bint BitmapAny(const ArrowBitmap*) bint BitmapAll(const ArrowBitmap*) bint BitmapOr(const ArrowBitmap*, const ArrowBitmap*, ArrowBitmap*) @@ -313,7 +313,7 @@ cdef class BitMaskArray: @staticmethod cdef BitMaskArray c_concatenate(list objs): cdef Py_ssize_t i - cdef int64_t bytes_needed, total_bits = 0 + cdef int64_t total_bits = 0 cdef BitMaskArray current_bma cdef Py_ssize_t nbitmaps = len(objs) @@ -340,19 +340,10 @@ cdef class BitMaskArray: ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, total_bits) - ConcatenateBitmapData(bitmaps, nbitmaps, bitmap.buffer.data) + ConcatenateBitmapData(bitmaps, nbitmaps, &bitmap) free(bitmaps) - # TODO: avoid nanoarrow internals - maybe handle in concat function? 
- bitmap.size_bits = total_bits - bytes_needed = total_bits // 8 - if total_bits % 8 != 0: - bytes_needed += 1 - bitmap.buffer.size_bytes = bytes_needed - bitmap.buffer.capacity_bytes = bytes_needed - bma.bitmap = bitmap - if second_dim != 0: bma.array_shape = tuple((total_bits // second_dim, second_dim)) else: @@ -386,8 +377,6 @@ cdef class BitMaskArray: ArrowBitSetTo(self.bitmap.buffer.data, ckey, cvalue) return - # TODO: implement fastpaths here for equal sized containers - # to avoid the to_numpy() call if is_null_slice(key) and isinstance(value, (int, bool)): cvalue = value # blindly assuming ints are 0 or 1 ArrowBitsSetTo( @@ -400,6 +389,7 @@ cdef class BitMaskArray: isinstance(key, np.ndarray) and key.dtype == bool and isinstance(value, (int, bool)) + and len(key) == len(self) ): keymask = key if BitmapPutFromBufferMask( @@ -636,8 +626,6 @@ cdef class BitMaskArray: cdef ArrowBitmap bitmap cdef BitMaskArray bma = BitMaskArray.__new__(BitMaskArray) - # TODO: this leaks a bit into the internals of the nanoarrow bitmap - # We may want to upstream a BitmapCopy function instead ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, nindices) diff --git a/pandas/_libs/include/pandas/bitmask_algorithms.h b/pandas/_libs/include/pandas/bitmask_algorithms.h index fb2bbdd711ac0..3ca086acf77a6 100644 --- a/pandas/_libs/include/pandas/bitmask_algorithms.h +++ b/pandas/_libs/include/pandas/bitmask_algorithms.h @@ -12,7 +12,7 @@ enough bytes to hold all elements. 
*/ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, - uint8_t *out); + struct ArrowBitmap *out); bool BitmapAny(const struct ArrowBitmap *bitmap); bool BitmapAll(const struct ArrowBitmap *bitmap); diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index 7181fc502d321..494ae411addb2 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -6,12 +6,13 @@ static const uint8_t clear_mask[8] = {0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f}; void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, - uint8_t *out) { + struct ArrowBitmap *out) { if (nbitmaps == 0) { return; } - uint8_t *out_cursor = out; + int64_t bits_processed = 0; + uint8_t *out_cursor = out->buffer.data; size_t start_bit_pos = 0; for (size_t i = 0; i < nbitmaps; i++) { const struct ArrowBitmap *bitmap = bitmaps[i]; @@ -39,6 +40,14 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, } else { out_cursor += nbytes - 1; } + + bits_processed += bitmap->size_bits; + } + + out->size_bits = bits_processed; + out->buffer.size_bytes = bits_processed / 8; + if ((bits_processed % 8) > 0) { + out->buffer.size_bytes += 1; } } From cd2794318fbdff740b28942b2ca15e3960c7df91 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Aug 2023 15:07:25 -0400 Subject: [PATCH 082/126] fix off by one --- pandas/_libs/src/bitmask_algorithms.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index 494ae411addb2..a8d669147700a 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -66,7 +66,7 @@ bool BitmapAny(const struct ArrowBitmap *bitmap) { } } - const size_t bits_remaining = nbits % 8; + const size_t bits_remaining = nbits - ((size_bytes - 1) * 8); for (size_t i = 0; i < bits_remaining; i++) { if (ArrowBitGet(buf, nbits - i - 
1)) { return true; @@ -91,7 +91,7 @@ bool BitmapAll(const struct ArrowBitmap *bitmap) { } } - const size_t bits_remaining = nbits % 8; + const size_t bits_remaining = nbits - ((size_bytes - 1) * 8); for (size_t i = 0; i < bits_remaining; i++) { if (ArrowBitGet(buf, nbits - i - 1) == 0) { return false; From dc54ca0131e7c78ebdeadd3431629e6a3253d956 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Aug 2023 15:16:21 -0400 Subject: [PATCH 083/126] make mypy happy --- pandas/_libs/arrays.pyi | 4 ++-- pandas/_libs/arrays.pyx | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index f3414659b53ec..55ff7273685df 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -54,8 +54,8 @@ class BitMaskArray: def __getstate__(self) -> dict: ... def __setstate__(self, other: dict) -> None: ... def __iter__(self): ... - @staticmethod - def concatenate(objs: list[Self], axis: int) -> Self: ... + @classmethod + def concatenate(cls, objs: list[Self], axis: int) -> Self: ... @property def size(self) -> int: ... 
@property diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index e4b7d9907fa48..274061f6c1929 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -353,8 +353,8 @@ cdef class BitMaskArray: return bma - @staticmethod - def concatenate(objs, axis): + @classmethod + def concatenate(cls, objs, axis): if axis != 0: raise NotImplementedError( "BitMaskArray.concatenate only implemented for axis=0" From 3794ec57fce20a6509f919caf8a3441fb11fffce Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Aug 2023 16:15:20 -0400 Subject: [PATCH 084/126] fix bug moving cursor when crossing byte boundary --- pandas/_libs/src/bitmask_algorithms.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index a8d669147700a..4ff63044aca86 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -17,7 +17,6 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, for (size_t i = 0; i < nbitmaps; i++) { const struct ArrowBitmap *bitmap = bitmaps[i]; const int64_t nbytes = bitmap->buffer.size_bytes; - const size_t trailing_nbits = bitmap->size_bits % 8; // As we loop through each array, any time we end up starting // on a word boundary we can simply use memcpy. 
If we are not @@ -34,13 +33,12 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, } } - start_bit_pos = (start_bit_pos + trailing_nbits) % 8; - if (start_bit_pos == 0) { - out_cursor += nbytes; - } else { - out_cursor += nbytes - 1; + out_cursor += nbytes - 1; + const int64_t new_bit_position = start_bit_pos + bitmap->size_bits; + if (new_bit_position >= 8) { + out_cursor += 1; } - + start_bit_pos = new_bit_position % 8; bits_processed += bitmap->size_bits; } From 274a7b53c24df6d5c49a35bfc5bd7c75836dc59f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Aug 2023 19:04:12 -0400 Subject: [PATCH 085/126] pedantic cleanups --- pandas/_libs/src/bitmask_algorithms.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index 4ff63044aca86..b6b75f05e7847 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -24,7 +24,7 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, if (start_bit_pos == 0) { memcpy(out_cursor, bitmap->buffer.data, nbytes); } else { - for (size_t j = 0; j < nbytes; j++) { + for (int64_t j = 0; j < nbytes; j++) { const uint8_t lshifted = bitmap->buffer.data[j] << start_bit_pos; out_cursor[j] = (out_cursor[j] & clear_mask[start_bit_pos]) | lshifted; @@ -107,7 +107,7 @@ int BitmapOr(const struct ArrowBitmap *bitmap1, return -1; } - for (size_t i = 0; i < bitmap1->buffer.size_bytes; i++) { + for (int64_t i = 0; i < bitmap1->buffer.size_bytes; i++) { out->buffer.data[i] = bitmap1->buffer.data[i] | bitmap2->buffer.data[i]; } @@ -125,7 +125,7 @@ int BitmapAnd(const struct ArrowBitmap *bitmap1, return -1; } - for (size_t i = 0; i < bitmap1->buffer.size_bytes; i++) { + for (int64_t i = 0; i < bitmap1->buffer.size_bytes; i++) { out->buffer.data[i] = bitmap1->buffer.data[i] & bitmap2->buffer.data[i]; } @@ -143,7 +143,7 @@ int BitmapXor(const struct 
ArrowBitmap *bitmap1, return -1; } - for (size_t i = 0; i < bitmap1->buffer.size_bytes; i++) { + for (int64_t i = 0; i < bitmap1->buffer.size_bytes; i++) { out->buffer.data[i] = bitmap1->buffer.data[i] ^ bitmap2->buffer.data[i]; } @@ -158,7 +158,7 @@ int BitmapInvert(const struct ArrowBitmap *bitmap, struct ArrowBitmap *out) { return -1; } - for (size_t i = 0; i < bitmap->buffer.size_bytes; i++) { + for (int64_t i = 0; i < bitmap->buffer.size_bytes; i++) { out->buffer.data[i] = ~bitmap->buffer.data[i]; } From 4b0603847f2887996c054ec8890520b66a610327 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Aug 2023 19:28:36 -0400 Subject: [PATCH 086/126] off by one fix --- pandas/_libs/src/bitmask_algorithms.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index b6b75f05e7847..c659832fa02bc 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -17,6 +17,9 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, for (size_t i = 0; i < nbitmaps; i++) { const struct ArrowBitmap *bitmap = bitmaps[i]; const int64_t nbytes = bitmap->buffer.size_bytes; + if (nbytes == 0) { + continue; + } // As we loop through each array, any time we end up starting // on a word boundary we can simply use memcpy. If we are not From 62657840f62d3a80ed80931386dbcd042939cba3 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 23 Aug 2023 19:47:06 -0400 Subject: [PATCH 087/126] Revert "fix bug moving cursor when crossing byte boundary" This reverts commit 3794ec57fce20a6509f919caf8a3441fb11fffce. 
--- pandas/_libs/src/bitmask_algorithms.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index c659832fa02bc..0a92be9fca253 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -20,6 +20,7 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, if (nbytes == 0) { continue; } + const size_t trailing_nbits = bitmap->size_bits % 8; // As we loop through each array, any time we end up starting // on a word boundary we can simply use memcpy. If we are not @@ -36,12 +37,13 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, } } - out_cursor += nbytes - 1; - const int64_t new_bit_position = start_bit_pos + bitmap->size_bits; - if (new_bit_position >= 8) { - out_cursor += 1; + start_bit_pos = (start_bit_pos + trailing_nbits) % 8; + if (start_bit_pos == 0) { + out_cursor += nbytes; + } else { + out_cursor += nbytes - 1; } - start_bit_pos = new_bit_position % 8; + bits_processed += bitmap->size_bits; } From cea82a500dc2e00d617b349625e1b6fbe1f4315b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 24 Aug 2023 15:24:40 -0400 Subject: [PATCH 088/126] concatenate bug fix --- pandas/_libs/src/bitmask_algorithms.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index 0a92be9fca253..ca286da4ebd17 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -37,13 +37,13 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, } } - start_bit_pos = (start_bit_pos + trailing_nbits) % 8; - if (start_bit_pos == 0) { - out_cursor += nbytes; - } else { - out_cursor += nbytes - 1; + out_cursor += nbytes; + const int64_t next_bit_pos = start_bit_pos + trailing_nbits; + if ((next_bit_pos > 0) && (next_bit_pos < 
8)) { + out_cursor--; } + start_bit_pos = next_bit_pos % 8; bits_processed += bitmap->size_bits; } From 0d529e86a54773eee7735ee41ef8ee85dcc69639 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 24 Aug 2023 22:12:40 -0400 Subject: [PATCH 089/126] fixed bounds issues --- pandas/_libs/src/bitmask_algorithms.c | 32 +++++++++++++++------------ 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index ca286da4ebd17..3ce52d54bc6ba 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -20,31 +20,35 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, if (nbytes == 0) { continue; } - const size_t trailing_nbits = bitmap->size_bits % 8; + const size_t bitmap_rem = bitmap->size_bits % 8; // As we loop through each array, any time we end up starting // on a word boundary we can simply use memcpy. If we are not // so lucky we fall back to bit shifting each element if (start_bit_pos == 0) { - memcpy(out_cursor, bitmap->buffer.data, nbytes); + const size_t index = bits_processed / 8; + memcpy(&out_cursor[index], bitmap->buffer.data, nbytes); + bits_processed += bitmap->size_bits; } else { for (int64_t j = 0; j < nbytes; j++) { const uint8_t lshifted = bitmap->buffer.data[j] << start_bit_pos; - out_cursor[j] = (out_cursor[j] & clear_mask[start_bit_pos]) | lshifted; - - const uint8_t rshifted = bitmap->buffer.data[j] >> (8 - start_bit_pos); - out_cursor[j + 1] = rshifted; + const size_t index = bits_processed / 8; + out_cursor[index] = (out_cursor[index] & clear_mask[start_bit_pos]) | lshifted; + + if (index < out->buffer.capacity_bytes - 1) { + const uint8_t rshifted = bitmap->buffer.data[j] >> (8 - start_bit_pos); + out_cursor[index + 1] = rshifted; + } + + if ((j == nbytes - 1) && (bitmap_rem > 0)){ + bits_processed += bitmap_rem; + } else { + bits_processed += 8; + } } } - out_cursor += nbytes; - const int64_t 
next_bit_pos = start_bit_pos + trailing_nbits; - if ((next_bit_pos > 0) && (next_bit_pos < 8)) { - out_cursor--; - } - - start_bit_pos = next_bit_pos % 8; - bits_processed += bitmap->size_bits; + start_bit_pos = (start_bit_pos + bitmap_rem) % 8; } out->size_bits = bits_processed; From e80e70913f58df4993ecea53f5ecb7de4a175da1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 24 Aug 2023 22:18:42 -0400 Subject: [PATCH 090/126] Revert "fixed bounds issues" This reverts commit 0d529e86a54773eee7735ee41ef8ee85dcc69639. --- pandas/_libs/src/bitmask_algorithms.c | 32 ++++++++++++--------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index 3ce52d54bc6ba..ca286da4ebd17 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -20,35 +20,31 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, if (nbytes == 0) { continue; } - const size_t bitmap_rem = bitmap->size_bits % 8; + const size_t trailing_nbits = bitmap->size_bits % 8; // As we loop through each array, any time we end up starting // on a word boundary we can simply use memcpy. 
If we are not // so lucky we fall back to bit shifting each element if (start_bit_pos == 0) { - const size_t index = bits_processed / 8; - memcpy(&out_cursor[index], bitmap->buffer.data, nbytes); - bits_processed += bitmap->size_bits; + memcpy(out_cursor, bitmap->buffer.data, nbytes); } else { for (int64_t j = 0; j < nbytes; j++) { const uint8_t lshifted = bitmap->buffer.data[j] << start_bit_pos; - const size_t index = bits_processed / 8; - out_cursor[index] = (out_cursor[index] & clear_mask[start_bit_pos]) | lshifted; - - if (index < out->buffer.capacity_bytes - 1) { - const uint8_t rshifted = bitmap->buffer.data[j] >> (8 - start_bit_pos); - out_cursor[index + 1] = rshifted; - } - - if ((j == nbytes - 1) && (bitmap_rem > 0)){ - bits_processed += bitmap_rem; - } else { - bits_processed += 8; - } + out_cursor[j] = (out_cursor[j] & clear_mask[start_bit_pos]) | lshifted; + + const uint8_t rshifted = bitmap->buffer.data[j] >> (8 - start_bit_pos); + out_cursor[j + 1] = rshifted; } } - start_bit_pos = (start_bit_pos + bitmap_rem) % 8; + out_cursor += nbytes; + const int64_t next_bit_pos = start_bit_pos + trailing_nbits; + if ((next_bit_pos > 0) && (next_bit_pos < 8)) { + out_cursor--; + } + + start_bit_pos = next_bit_pos % 8; + bits_processed += bitmap->size_bits; } out->size_bits = bits_processed; From 8689c993e1b4cc51b1c86853eb119de8a2912946 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 24 Aug 2023 22:24:33 -0400 Subject: [PATCH 091/126] faster impl --- pandas/_libs/src/bitmask_algorithms.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index ca286da4ebd17..8c15cf823ded6 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -32,8 +32,10 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, const uint8_t lshifted = bitmap->buffer.data[j] << start_bit_pos; out_cursor[j] = (out_cursor[j] & 
clear_mask[start_bit_pos]) | lshifted; - const uint8_t rshifted = bitmap->buffer.data[j] >> (8 - start_bit_pos); - out_cursor[j + 1] = rshifted; + if (out_cursor - out->buffer.data < out->buffer.capacity_bytes - 1) { + const uint8_t rshifted = bitmap->buffer.data[j] >> (8 - start_bit_pos); + out_cursor[j + 1] = rshifted; + } } } From 4ed187598e8dc577e22f150e7daa72f399806606 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 24 Aug 2023 22:42:58 -0400 Subject: [PATCH 092/126] move condition out of loop --- pandas/_libs/src/bitmask_algorithms.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index 8c15cf823ded6..e74f7a2aa6ab3 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -28,14 +28,24 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, if (start_bit_pos == 0) { memcpy(out_cursor, bitmap->buffer.data, nbytes); } else { - for (int64_t j = 0; j < nbytes; j++) { + for (int64_t j = 0; j < nbytes - 1; j++) { const uint8_t lshifted = bitmap->buffer.data[j] << start_bit_pos; out_cursor[j] = (out_cursor[j] & clear_mask[start_bit_pos]) | lshifted; - if (out_cursor - out->buffer.data < out->buffer.capacity_bytes - 1) { - const uint8_t rshifted = bitmap->buffer.data[j] >> (8 - start_bit_pos); - out_cursor[j + 1] = rshifted; - } + const uint8_t rshifted = bitmap->buffer.data[j] >> (8 - start_bit_pos); + out_cursor[j + 1] = rshifted; + } + + // last byte can overrun - check outside loop for performance + const size_t index = nbytes - 1; + const uint8_t lshifted = bitmap->buffer.data[index] << start_bit_pos; + out_cursor[index] = + (out_cursor[index] & clear_mask[start_bit_pos]) | lshifted; + + if (out_cursor - out->buffer.data < out->buffer.capacity_bytes - 1) { + const uint8_t rshifted = + bitmap->buffer.data[index] >> (8 - start_bit_pos); + out_cursor[index + 1] = rshifted; } } 
From 24c381457dd0180518b17888e3c53afed4253d61 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 25 Aug 2023 08:40:08 -0400 Subject: [PATCH 093/126] memory benchmark --- asv_bench/benchmarks/array.py | 11 +++++++++++ pandas/_libs/arrays.pyx | 16 ++++++++-------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 09c4acc0ab309..e545fcab513f0 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -28,6 +28,17 @@ def time_from_float_array(self): pd.array(self.values_float, dtype="boolean") +class BooleanArrayMem: + def setup_cache(self): + N = 250_000 + data = np.array([True] * N) + mask = np.array([False] * N) + return [pd.arrays.BooleanArray(data, mask)] * 500 + + def peakmem_array(self, arrays): + return [~x for x in arrays] + + class IntegerArray: def setup(self): N = 250_000 diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 274061f6c1929..35e1c59d92631 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -6,10 +6,10 @@ cimport cython import numpy as np cimport numpy as cnp -from cpython cimport PyErr_Clear -from libc.stdlib cimport ( - free, - malloc, +from cpython cimport ( + PyErr_Clear, + PyMem_Free, + PyMem_Malloc, ) from libc.string cimport memcpy from numpy cimport ( @@ -293,7 +293,7 @@ cdef class BitMaskArray: # TODO: this leaks a bit into the internals of the nanoarrow bitmap # We may want to upstream a BitmapCopy function instead ArrowBitmapInit(&bitmap) - buf = malloc(old_bma.bitmap.buffer.size_bytes) + buf = PyMem_Malloc(old_bma.bitmap.buffer.size_bytes) memcpy(buf, old_bma.bitmap.buffer.data, old_bma.bitmap.buffer.size_bytes) bitmap.buffer.size_bytes = old_bma.bitmap.buffer.size_bytes bitmap.buffer.capacity_bytes = old_bma.bitmap.buffer.capacity_bytes @@ -326,7 +326,7 @@ cdef class BitMaskArray: "BitMaskArray.concatenate does not support broadcasting" ) - cdef ArrowBitmap** bitmaps = malloc( + cdef 
ArrowBitmap** bitmaps = PyMem_Malloc( sizeof(ArrowBitmap*) * nbitmaps ) for i in range(nbitmaps): @@ -341,7 +341,7 @@ cdef class BitMaskArray: ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, total_bits) ConcatenateBitmapData(bitmaps, nbitmaps, &bitmap) - free(bitmaps) + PyMem_Free(bitmaps) bma.bitmap = bitmap if second_dim != 0: @@ -560,7 +560,7 @@ cdef class BitMaskArray: else: ArrowBitmapInit(&bitmap) - buf = malloc(nbytes) + buf = PyMem_Malloc(nbytes) data = state["bitmap_data"] for i in range(nbytes): buf[i] = data[i] From 5ad89648de26418d7c02728ca5015ef4f6d09107 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 25 Aug 2023 09:34:31 -0400 Subject: [PATCH 094/126] use c standard malloc/free --- pandas/_libs/arrays.pyx | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 35e1c59d92631..274061f6c1929 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -6,10 +6,10 @@ cimport cython import numpy as np cimport numpy as cnp -from cpython cimport ( - PyErr_Clear, - PyMem_Free, - PyMem_Malloc, +from cpython cimport PyErr_Clear +from libc.stdlib cimport ( + free, + malloc, ) from libc.string cimport memcpy from numpy cimport ( @@ -293,7 +293,7 @@ cdef class BitMaskArray: # TODO: this leaks a bit into the internals of the nanoarrow bitmap # We may want to upstream a BitmapCopy function instead ArrowBitmapInit(&bitmap) - buf = PyMem_Malloc(old_bma.bitmap.buffer.size_bytes) + buf = malloc(old_bma.bitmap.buffer.size_bytes) memcpy(buf, old_bma.bitmap.buffer.data, old_bma.bitmap.buffer.size_bytes) bitmap.buffer.size_bytes = old_bma.bitmap.buffer.size_bytes bitmap.buffer.capacity_bytes = old_bma.bitmap.buffer.capacity_bytes @@ -326,7 +326,7 @@ cdef class BitMaskArray: "BitMaskArray.concatenate does not support broadcasting" ) - cdef ArrowBitmap** bitmaps = PyMem_Malloc( + cdef ArrowBitmap** bitmaps = malloc( sizeof(ArrowBitmap*) * nbitmaps ) for i in range(nbitmaps): @@ 
-341,7 +341,7 @@ cdef class BitMaskArray: ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, total_bits) ConcatenateBitmapData(bitmaps, nbitmaps, &bitmap) - PyMem_Free(bitmaps) + free(bitmaps) bma.bitmap = bitmap if second_dim != 0: @@ -560,7 +560,7 @@ cdef class BitMaskArray: else: ArrowBitmapInit(&bitmap) - buf = PyMem_Malloc(nbytes) + buf = malloc(nbytes) data = state["bitmap_data"] for i in range(nbytes): buf[i] = data[i] From b64ba05d76ccac18643d14399988eaedfda42ab7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 25 Aug 2023 09:54:30 -0400 Subject: [PATCH 095/126] added repr for bitmaskarray --- pandas/_libs/arrays.pyx | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 274061f6c1929..9ea8a28aea9f5 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -308,6 +308,24 @@ cdef class BitMaskArray: def __len__(self): return self.bitmap.size_bits + def __repr__(self): + cdef Py_ssize_t i, nbytes = self.bitmap.buffer.size_bytes + arr_bytes = bytearray(nbytes) + for i in range(nbytes): + arr_bytes[i] = self.bitmap.buffer.data[i] + + if self.parent: + par = object.__repr__(self.parent) + else: + par = None + + shape = self.array_shape + data = repr(arr_bytes) + + return ( + f"{object.__repr__(self)}\nparent: {par}\nshape: {shape}\ndata: {data}\n" + ) + @cython.wraparound(False) @cython.boundscheck(False) @staticmethod From a51dfe9ba6b8cb27babf1c84c0d9dca7ac052dbc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 25 Aug 2023 12:31:36 -0400 Subject: [PATCH 096/126] more tests and better repr --- pandas/_libs/arrays.pyi | 2 + pandas/_libs/arrays.pyx | 23 +- pandas/tests/arrays/masked/test_bitmask.py | 421 +++++++++++++++++++++ 3 files changed, 440 insertions(+), 6 deletions(-) create mode 100644 pandas/tests/arrays/masked/test_bitmask.py diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 55ff7273685df..643324bde838e 100644 --- a/pandas/_libs/arrays.pyi +++ 
b/pandas/_libs/arrays.pyi @@ -61,6 +61,8 @@ class BitMaskArray: @property def nbytes(self) -> int: ... @property + def bytes(self) -> bytes: ... + @property def shape(self) -> tuple[int, ...]: ... @property def dtype(self) -> type_t[bool]: ... diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 9ea8a28aea9f5..cd348bdebee2d 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -309,18 +309,13 @@ cdef class BitMaskArray: return self.bitmap.size_bits def __repr__(self): - cdef Py_ssize_t i, nbytes = self.bitmap.buffer.size_bytes - arr_bytes = bytearray(nbytes) - for i in range(nbytes): - arr_bytes[i] = self.bitmap.buffer.data[i] - if self.parent: par = object.__repr__(self.parent) else: par = None shape = self.array_shape - data = repr(arr_bytes) + data = self.bytes return ( f"{object.__repr__(self)}\nparent: {par}\nshape: {shape}\ndata: {data}\n" @@ -437,6 +432,10 @@ cdef class BitMaskArray: return self.to_numpy()[key] def __invert__(self): + # note that this inverts the entire byte, even if the + # bitmap only uses a few of the bits within that byte + # the remaining bits of the byte are of undefined value + # so be sure to only check bytes we need cdef BitMaskArray self_ = self cdef BitMaskArray bma = BitMaskArray.__new__(BitMaskArray) cdef ArrowBitmap bitmap @@ -459,6 +458,7 @@ cdef class BitMaskArray: cdef ArrowBitmap bitmap if isinstance(other, type(self)): + # TODO: maybe should return Self here instead of ndarray other_bma = other if self_.bitmap.size_bits == 0: return np.empty(dtype=bool).reshape(self.array_shape) @@ -487,6 +487,7 @@ cdef class BitMaskArray: cdef ArrowBitmap bitmap if isinstance(other, type(self)): + # TODO: maybe should return Self here instead of ndarray other_bma = other if self_.bitmap.size_bits == 0: return np.empty(dtype=bool).reshape(self.array_shape) @@ -515,6 +516,7 @@ cdef class BitMaskArray: cdef ArrowBitmap bitmap if isinstance(other, type(self)): + # TODO: maybe should return Self here instead of 
ndarray other_bma = other if self_.bitmap.size_bits == 0: return np.empty(dtype=bool).reshape(self.array_shape) @@ -605,6 +607,15 @@ cdef class BitMaskArray: def nbytes(self) -> int: return self.bitmap.buffer.size_bytes + @property + def bytes(self): + cdef Py_ssize_t i, nbytes = self.bitmap.buffer.size_bytes + arr_bytes = bytearray(nbytes) + for i in range(nbytes): + arr_bytes[i] = self.bitmap.buffer.data[i] + + return bytes(arr_bytes) + @property def shape(self): """Strictly for NumPy compat in mask_ops""" diff --git a/pandas/tests/arrays/masked/test_bitmask.py b/pandas/tests/arrays/masked/test_bitmask.py new file mode 100644 index 0000000000000..1e1258984edaf --- /dev/null +++ b/pandas/tests/arrays/masked/test_bitmask.py @@ -0,0 +1,421 @@ +import itertools +import pickle + +import numpy as np +import pytest + +from pandas._libs.arrays import BitMaskArray + +import pandas._testing as tm + + +@pytest.mark.parametrize( + "array,expected", + [ + (np.array([False, False]), bytes([0x0])), + (np.array([True, False]), bytes([0x1])), + (np.array([False, True]), bytes([0x2])), + (np.array([True, True]), bytes([0x3])), + (np.array([True, False] * 8), bytes([0x55, 0x55])), + ], +) +def test_constructor_ndarray(array, expected): + bma = BitMaskArray(array) + assert bma.bytes == expected + assert not bma.parent + assert bma.array_shape == array.shape + + +@pytest.mark.parametrize( + "parent,expected", + [ + (BitMaskArray(np.array([False, False])), bytes([0x0])), + (BitMaskArray(np.array([True, False])), bytes([0x1])), + (BitMaskArray(np.array([False, True])), bytes([0x2])), + (BitMaskArray(np.array([True, True])), bytes([0x3])), + (BitMaskArray(np.array([True, False] * 8)), bytes([0x55, 0x55])), + ], +) +def test_constructor_bitmap(parent, expected): + bma = BitMaskArray(parent) + assert bma.bytes == expected + assert bma.parent is parent + assert bma.array_shape == parent.shape + + +def test_len(): + bma = BitMaskArray(np.array([True, False, False])) + assert len(bma) == 3 + 
+ +def test_repr_no_parent(): + bma = BitMaskArray(np.array([True, False, False])) + result = repr(bma) + assert "parent: None" in result + assert "shape: (3,)" in result + assert "data: b'\\x01'" in result + + +def test_repr_parent(): + parent = BitMaskArray(np.array([False, False, True])) + bma = BitMaskArray(parent) + result = repr(bma) + parent_id = hex(id(parent)) + assert f"parent: > 1) & 1 == 0 + assert (result.bytes[0] >> 2) & 1 == 1 + + +@pytest.mark.parametrize( + "indexer,expected", + [ + ([0, 1], np.array([True, False])), + (np.array([2, 1]), np.array([True, False])), + (slice(1, 2), np.array([False])), + ], +) +def test_getitem_numpy_fallback(indexer, expected): + bma = BitMaskArray(np.array([True, False, True])) + result = bma[indexer] + + tm.assert_numpy_array_equal(result, expected) + + +def test_setitem_scalar(): + bma = BitMaskArray(np.array([True, False, True])) + + bma[0] = False + assert not bma[0] + + bma[:] = True + assert bma[0] and bma[1] and bma[2] + + bma[np.array([False, False, True])] = False + assert bma[0] and bma[1] and not bma[2] + + bma[[False, True, False]] = False + assert bma[0] and not bma[1] and not bma[2] + + +def test_setitem_array(): + bma = BitMaskArray(np.array([True, False, True])) + + bma[:] = [False, True, False] + assert not bma[0] and bma[1] and not bma[2] + + bma[:] = np.array([True, False, True]) + assert bma[0] and not bma[1] and bma[2] + + +def test_invert(): + result1 = ~BitMaskArray(np.array([True, False])) + assert (result1.bytes[0] & 0x1) == 0 + assert ((result1.bytes[0] >> 1) & 0x1) == 1 + + result2 = ~BitMaskArray(np.array([False, True])) + assert (result2.bytes[0] & 0x1) == 1 + assert ((result2.bytes[0] >> 1) & 0x1) == 0 + + +@pytest.mark.parametrize("rhs_as_bitmask", [True, False]) +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], [True], [True]), + ([True], [False], [False]), + ([False], [False], [False]), + ([True] * 10, [True] * 10, [True] * 10), + ([False] * 10, [True] * 10, [False] * 
10), + ], +) +def test_and(rhs_as_bitmask, lhs, rhs, expected): + bma1 = BitMaskArray(np.array(lhs)) + + if rhs_as_bitmask: + bma2 = BitMaskArray(np.array(rhs)) + else: + bma2 = np.array(rhs) + + expected = np.array(expected) + result = bma1 & bma2 + assert (result == expected).all() + + +@pytest.mark.parametrize("rhs_as_bitmask", [True, False]) +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], [True], [True]), + ([True], [False], [True]), + ([False], [False], [False]), + ([True] * 10, [True] * 10, [True] * 10), + ([False] * 10, [True] * 10, [True] * 10), + ], +) +def test_or(rhs_as_bitmask, lhs, rhs, expected): + bma1 = BitMaskArray(np.array(lhs)) + + if rhs_as_bitmask: + bma2 = BitMaskArray(np.array(rhs)) + else: + bma2 = np.array(rhs) + + expected = np.array(expected) + result = bma1 | bma2 + assert (result == expected).all() + + +@pytest.mark.parametrize("rhs_as_bitmask", [True, False]) +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], [True], [False]), + ([True], [False], [True]), + ([False], [False], [False]), + ([True] * 10, [True] * 10, [False] * 10), + ([False] * 10, [True] * 10, [True] * 10), + ], +) +def test_xor(rhs_as_bitmask, lhs, rhs, expected): + bma1 = BitMaskArray(np.array(lhs)) + + if rhs_as_bitmask: + bma2 = BitMaskArray(np.array(rhs)) + else: + bma2 = np.array(rhs) + + expected = np.array(expected) + result = bma1 ^ bma2 + assert (result == expected).all() + + +def test_pickle(): + parent = BitMaskArray(np.array([True, False, True])) + child = BitMaskArray(parent) + + result_child = pickle.loads(pickle.dumps(child)) + + assert result_child.shape == child.shape + assert result_child.bytes == child.bytes + + assert result_child.parent.shape == parent.shape + assert result_child.parent.bytes == parent.bytes + assert not result_child.parent.parent + + +def test_iter(): + bma = BitMaskArray(np.array([True, False, True])) + itr = iter(bma) + + assert next(itr) is True + assert next(itr) is False + assert next(itr) is True 
+ + with pytest.raises(StopIteration, match=""): + next(itr) + + +@pytest.mark.parametrize( + "data,expected", + [ + (np.array([], dtype=bool), 0), + (np.array([True, False, True]), 3), + (np.array([True] * 8), 8), + (np.array([True] * 8 + [False]), 9), + ], +) +def test_size(data, expected): + bma = BitMaskArray(data) + result = bma.size + assert result == expected + + +@pytest.mark.parametrize( + "data,expected", + [ + (np.array([], dtype=bool), 0), + (np.array([True, False, True]), 1), + (np.array([True] * 8), 1), + (np.array([True] * 8 + [False]), 2), + ], +) +def test_nbytes(data, expected): + bma = BitMaskArray(data) + result = bma.nbytes + assert result == expected + + +@pytest.mark.parametrize( + "data", + [ + np.array([True, False]), + np.array([True, False]).reshape(2, -1), + np.array([True, False]).reshape(-1, 2), + ], +) +def test_shape(data): + bma = BitMaskArray(data) + assert bma.array_shape == data.shape + + +@pytest.mark.parametrize( + "data,expected", + [ + (np.array([], dtype=bool), False), + (np.array([True]), True), + (np.array([False]), False), + (np.array([True] * 8 + [False]), True), + ], +) +def test_any(data, expected): + bma = BitMaskArray(data) + assert bma.any() == expected + + +@pytest.mark.parametrize( + "data,expected", + [ + (np.array([], dtype=bool), True), + (np.array([True]), True), + (np.array([False]), False), + (np.array([True] * 8 + [False]), False), + ], +) +def test_all(data, expected): + bma = BitMaskArray(data) + assert bma.all() == expected + + +@pytest.mark.parametrize( + "data,expected", + [ + (np.array([], dtype=bool), 0), + (np.array([True]), 1), + (np.array([False]), 0), + (np.array([True] * 8 + [False]), 8), + ], +) +def test_sum(data, expected): + bma = BitMaskArray(data) + assert bma.sum() == expected + + +def test_take1d(): + bma = BitMaskArray(np.array([True, False, True, False])) + + result1 = bma.take_1d(np.array([0]), axis=0) + assert (result1.bytes[0] & 0x1) == 1 + + result2 = bma.take_1d(np.array([1]), 
axis=0) + assert (result2.bytes[0] & 0x1) == 0 + + result3 = bma.take_1d(np.array([0, 1]), axis=0) + assert (result3.bytes[0] & 0x1) == 1 + assert ((result3.bytes[0] >> 1) & 0x1) == 0 + + result4 = bma.take_1d(np.array([0, 0]), axis=0) + assert (result4.bytes[0] & 0x1) == 1 + assert ((result4.bytes[0] >> 1) & 0x1) == 1 + + result5 = bma.take_1d(np.array([3, 2, 1, 0]), axis=0) + assert (result5.bytes[0] & 0x1) == 0 + assert ((result5.bytes[0] >> 1) & 0x1) == 1 + assert ((result5.bytes[0] >> 2) & 0x1) == 0 + assert ((result5.bytes[0] >> 3) & 0x1) == 1 + + +def test_take1d_raises_not_axis0(): + bma = BitMaskArray(np.array([True, False, True])) + with pytest.raises(NotImplementedError, match="only implemented for axis=0"): + bma.take_1d(np.array([1]), axis=1) + + +def test_take_1d_raises_empty_indices(): + bma = BitMaskArray(np.array([True, False, True])) + with pytest.raises(NotImplementedError, match="does not support empty takes"): + bma.take_1d(np.array([], dtype="int64"), axis=0) + + +def test_take_1d_raises_negative_indices(): + bma = BitMaskArray(np.array([True, False, True])) + with pytest.raises(NotImplementedError, match="does not support negative indexing"): + bma.take_1d(np.array([-1], dtype="int64"), axis=0) + + +def test_copy(): + old_bma = BitMaskArray(np.array([True, False, True, False])) + bma = old_bma.copy() + + assert bma.bytes == old_bma.bytes + assert bma.shape == old_bma.shape + assert not bma.parent + + +@pytest.mark.parametrize( + "data", + [ + np.array([], dtype=bool), + np.array([True] * 100, dtype=bool), + np.array([[True, False], [True, False], [True, True], [False, False]]), + np.array([[True, False, True, False], [True, True, False, False]]), + ], +) +def test_to_numpy(data): + bma = BitMaskArray(data) + + result = bma.to_numpy() + tm.assert_numpy_array_equal(result, data) From 34d4ffcf3798930f49757891b41886b8dbe6b3d3 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 25 Aug 2023 12:34:31 -0400 Subject: [PATCH 097/126] BitMask -> bitmask 
--- pandas/_libs/arrays.pyi | 2 +- pandas/_libs/arrays.pyx | 62 ++++++------- pandas/core/algorithms.py | 2 +- pandas/core/array_algos/masked_reductions.py | 22 ++--- pandas/core/arrays/boolean.py | 6 +- pandas/core/arrays/masked.py | 24 ++--- pandas/core/nanops.py | 4 +- pandas/core/ops/mask_ops.py | 26 +++--- .../arrays/floating/test_construction.py | 2 +- .../tests/arrays/integer/test_construction.py | 2 +- pandas/tests/arrays/masked/test_bitmask.py | 90 +++++++++---------- 11 files changed, 121 insertions(+), 121 deletions(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 643324bde838e..d890a59218235 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -42,7 +42,7 @@ class NDArrayBacked: cls, to_concat: Sequence[Self], axis: AxisInt = ... ) -> Self: ... -class BitMaskArray: +class BitmaskArray: parent: Self def __init__(self, data: np.ndarray | Self) -> None: ... def __setitem__(self, key: PositionalIndexer, value: ArrayLike | bool) -> None: ... 
diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index cd348bdebee2d..9c0186e4c221e 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -236,7 +236,7 @@ cdef class NDArrayBacked: return to_concat[0]._from_backing_data(new_arr) -cdef class BitMaskArray: +cdef class BitmaskArray: cdef: ArrowBitmap bitmap bint buffer_owner # set when parent is None, but gives C-level access @@ -257,7 +257,7 @@ cdef class BitMaskArray: self.buffer_owner = True self.bitmap = bitmap - cdef void init_from_bitmaskarray(self, BitMaskArray bma): + cdef void init_from_bitmaskarray(self, BitmaskArray bma): self.buffer_owner = False self.bitmap = bma.bitmap @@ -274,20 +274,20 @@ cdef class BitMaskArray: self.array_shape = data.array_shape self.parent = data else: - raise TypeError("Unsupported argument to BitMaskArray constructor") + raise TypeError("Unsupported argument to BitmaskArray constructor") def __dealloc__(self): if self.buffer_owner: ArrowBitmapReset(&self.bitmap) @staticmethod - cdef BitMaskArray copy_from_bitmaskarray(BitMaskArray old_bma): + cdef BitmaskArray copy_from_bitmaskarray(BitmaskArray old_bma): """ - Constructs a new BitMaskArray from a bitmap pointer. Copies data + Constructs a new BitmaskArray from a bitmap pointer. Copies data and manages the subsequenty lifecycle of the bitmap. 
""" # Bypass __init__ calls - cdef BitMaskArray bma = BitMaskArray.__new__(BitMaskArray) + cdef BitmaskArray bma = BitmaskArray.__new__(BitmaskArray) cdef uint8_t* buf cdef ArrowBitmap bitmap # TODO: this leaks a bit into the internals of the nanoarrow bitmap @@ -324,10 +324,10 @@ cdef class BitMaskArray: @cython.wraparound(False) @cython.boundscheck(False) @staticmethod - cdef BitMaskArray c_concatenate(list objs): + cdef BitmaskArray c_concatenate(list objs): cdef Py_ssize_t i cdef int64_t total_bits = 0 - cdef BitMaskArray current_bma + cdef BitmaskArray current_bma cdef Py_ssize_t nbitmaps = len(objs) cdef Py_ssize_t second_dim = 0 @@ -336,19 +336,19 @@ cdef class BitMaskArray: for obj in objs: if not obj.array_shape[1] == second_dim: raise NotImplementedError( - "BitMaskArray.concatenate does not support broadcasting" + "BitmaskArray.concatenate does not support broadcasting" ) cdef ArrowBitmap** bitmaps = malloc( sizeof(ArrowBitmap*) * nbitmaps ) for i in range(nbitmaps): - current_bma = objs[i] + current_bma = objs[i] total_bits += current_bma.bitmap.size_bits bitmaps[i] = ¤t_bma.bitmap # Bypass __init__ calls - cdef BitMaskArray bma = BitMaskArray.__new__(BitMaskArray) + cdef BitmaskArray bma = BitmaskArray.__new__(BitmaskArray) cdef ArrowBitmap bitmap ArrowBitmapInit(&bitmap) @@ -370,10 +370,10 @@ cdef class BitMaskArray: def concatenate(cls, objs, axis): if axis != 0: raise NotImplementedError( - "BitMaskArray.concatenate only implemented for axis=0" + "BitmaskArray.concatenate only implemented for axis=0" ) - return BitMaskArray.c_concatenate(objs) + return BitmaskArray.c_concatenate(objs) def __setitem__(self, key, value): cdef const uint8_t[:] keymask @@ -381,7 +381,7 @@ cdef class BitMaskArray: cdef Py_ssize_t i = 0 cdef Py_ssize_t ckey cdef bint cvalue - cdef BitMaskArray self_ = self + cdef BitmaskArray self_ = self if isinstance(key, int): ckey = key @@ -411,7 +411,7 @@ cdef class BitMaskArray: keymask.shape[0], value ) != 0: - raise 
ValueError("BitMaskArray.__setitem__ failed!") + raise ValueError("BitmaskArray.__setitem__ failed!") else: arr = self.to_numpy() arr[key] = value @@ -436,8 +436,8 @@ cdef class BitMaskArray: # bitmap only uses a few of the bits within that byte # the remaining bits of the byte are of undefined value # so be sure to only check bytes we need - cdef BitMaskArray self_ = self - cdef BitMaskArray bma = BitMaskArray.__new__(BitMaskArray) + cdef BitmaskArray self_ = self + cdef BitmaskArray bma = BitmaskArray.__new__(BitmaskArray) cdef ArrowBitmap bitmap ArrowBitmapInit(&bitmap) @@ -454,7 +454,7 @@ cdef class BitMaskArray: def __and__(self, other): cdef ndarray[uint8_t] result - cdef BitMaskArray other_bma, self_ = self # self_ required for Cython < 3 + cdef BitmaskArray other_bma, self_ = self # self_ required for Cython < 3 cdef ArrowBitmap bitmap if isinstance(other, type(self)): @@ -471,7 +471,7 @@ cdef class BitMaskArray: BitmapAnd(&self_.bitmap, &other_bma.bitmap, &bitmap) result = np.empty(self_.bitmap.size_bits, dtype=bool) - BitMaskArray.buffer_to_array_1d( + BitmaskArray.buffer_to_array_1d( result, bitmap.buffer.data, bitmap.size_bits @@ -483,7 +483,7 @@ cdef class BitMaskArray: def __or__(self, other): cdef ndarray[uint8_t] result - cdef BitMaskArray other_bma, self_ = self # self_ required for Cython < 3 + cdef BitmaskArray other_bma, self_ = self # self_ required for Cython < 3 cdef ArrowBitmap bitmap if isinstance(other, type(self)): @@ -500,7 +500,7 @@ cdef class BitMaskArray: BitmapOr(&self_.bitmap, &other_bma.bitmap, &bitmap) result = np.empty(self_.bitmap.size_bits, dtype=bool) - BitMaskArray.buffer_to_array_1d( + BitmaskArray.buffer_to_array_1d( result, bitmap.buffer.data, bitmap.size_bits @@ -512,7 +512,7 @@ cdef class BitMaskArray: def __xor__(self, other): cdef ndarray[uint8_t] result - cdef BitMaskArray other_bma, self_ = self # self_ required for Cython < 3 + cdef BitmaskArray other_bma, self_ = self # self_ required for Cython < 3 cdef 
ArrowBitmap bitmap if isinstance(other, type(self)): @@ -529,7 +529,7 @@ cdef class BitMaskArray: BitmapXor(&self_.bitmap, &other_bma.bitmap, &bitmap) result = np.empty(self_.bitmap.size_bits, dtype=bool) - BitMaskArray.buffer_to_array_1d( + BitmaskArray.buffer_to_array_1d( result, bitmap.buffer.data, bitmap.size_bits @@ -540,7 +540,7 @@ cdef class BitMaskArray: return self.to_numpy() ^ other def __getstate__(self): - cdef BitMaskArray self_ = self + cdef BitmaskArray self_ = self state = { "parent": self.parent, "array_shape": self.array_shape, @@ -563,7 +563,7 @@ cdef class BitMaskArray: def __setstate__(self, state): cdef ArrowBitmap bitmap - cdef BitMaskArray self_ = self, other + cdef BitmaskArray self_ = self, other self.parent = state["parent"] self.array_shape = state["array_shape"] self_.buffer_owner = state["buffer_owner"] @@ -595,7 +595,7 @@ cdef class BitMaskArray: @cython.wraparound(False) def __iter__(self): cdef Py_ssize_t i - cdef BitMaskArray self_ = self # self_ required for Cython < 3 + cdef BitmaskArray self_ = self # self_ required for Cython < 3 for i in range(self_.bitmap.size_bits): yield bool(ArrowBitGet(self_.bitmap.buffer.data, i)) @@ -640,11 +640,11 @@ cdef class BitMaskArray: const int64_t[:] indices, const int axis=0, ): - cdef BitMaskArray self_ = self + cdef BitmaskArray self_ = self cdef Py_ssize_t nindices = len(indices) if axis != 0: raise NotImplementedError( - "BitMaskArray.take_1d only implemented for axis=0" + "BitmaskArray.take_1d only implemented for axis=0" ) if nindices <= 0: @@ -653,7 +653,7 @@ cdef class BitMaskArray: ) cdef ArrowBitmap bitmap - cdef BitMaskArray bma = BitMaskArray.__new__(BitMaskArray) + cdef BitmaskArray bma = BitmaskArray.__new__(BitmaskArray) ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, nindices) @@ -668,7 +668,7 @@ cdef class BitMaskArray: return bma def copy(self): - return BitMaskArray.copy_from_bitmaskarray(self) + return BitmaskArray.copy_from_bitmaskarray(self) 
@cython.boundscheck(False) # TODO: Removing this causes an IndexError? Zero size? @cython.wraparound(False) @@ -678,7 +678,7 @@ cdef class BitMaskArray: def to_numpy(self) -> ndarray: cdef ndarray[uint8_t] result = np.empty(self.bitmap.size_bits, dtype=bool) - BitMaskArray.buffer_to_array_1d( + BitmaskArray.buffer_to_array_1d( result, self.bitmap.buffer.data, self.bitmap.size_bits diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 058e62e47d063..b4c592af9ff5f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1298,7 +1298,7 @@ def take( warnings.warn( "pd.api.extensions.take accepting non-standard inputs is deprecated " "and will raise in a future version. Pass either a numpy.ndarray, " - "ExtensionArray, Index, Series, or BitMaskArray instead.", + "ExtensionArray, Index, Series, or BitmaskArray instead.", FutureWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index f426e57da4380..e7c39144fad35 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -13,7 +13,7 @@ import numpy as np from pandas._libs import missing as libmissing -from pandas._libs.arrays import BitMaskArray +from pandas._libs.arrays import BitmaskArray from pandas.core.nanops import check_below_min_count @@ -24,7 +24,7 @@ def _reductions( func: Callable, values: np.ndarray, - mask: BitMaskArray, + mask: BitmaskArray, *, skipna: bool = True, min_count: int = 0, @@ -65,7 +65,7 @@ def _reductions( def sum( values: np.ndarray, - mask: BitMaskArray, + mask: BitmaskArray, *, skipna: bool = True, min_count: int = 0, @@ -78,7 +78,7 @@ def sum( def prod( values: np.ndarray, - mask: BitMaskArray, + mask: BitmaskArray, *, skipna: bool = True, min_count: int = 0, @@ -92,7 +92,7 @@ def prod( def _minmax( func: Callable, values: np.ndarray, - mask: np.ndarray | BitMaskArray, + mask: np.ndarray | BitmaskArray, *, 
skipna: bool = True, axis: AxisInt | None = None, @@ -119,7 +119,7 @@ def _minmax( else: return func(values, axis=axis) else: - if isinstance(mask, BitMaskArray): + if isinstance(mask, BitmaskArray): subset = values[(~mask).to_numpy()] else: subset = values[~mask] @@ -132,7 +132,7 @@ def _minmax( def min( values: np.ndarray, - mask: np.ndarray | BitMaskArray, + mask: np.ndarray | BitmaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -142,7 +142,7 @@ def min( def max( values: np.ndarray, - mask: np.ndarray | BitMaskArray, + mask: np.ndarray | BitmaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -152,7 +152,7 @@ def max( def mean( values: np.ndarray, - mask: BitMaskArray, + mask: BitmaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -164,7 +164,7 @@ def mean( def var( values: np.ndarray, - mask: BitMaskArray, + mask: BitmaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -182,7 +182,7 @@ def var( def std( values: np.ndarray, - mask: BitMaskArray, + mask: BitmaskArray, *, skipna: bool = True, axis: AxisInt | None = None, diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index b8e8a99847383..9d0376169fe0d 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -28,7 +28,7 @@ if TYPE_CHECKING: import pyarrow - from pandas._libs.arrays import BitMaskArray + from pandas._libs.arrays import BitmaskArray from pandas._typing import ( Dtype, DtypeObj, @@ -301,14 +301,14 @@ class BooleanArray(BaseMaskedArray): @classmethod def _simple_new( - cls, values: np.ndarray, mask: npt.NDArray[np.bool_] | BitMaskArray + cls, values: np.ndarray, mask: npt.NDArray[np.bool_] | BitmaskArray ) -> Self: result = super()._simple_new(values, mask) result._dtype = BooleanDtype() return result def __init__( - self, values: np.ndarray, mask: np.ndarray | BitMaskArray, copy: bool = False + self, values: np.ndarray, mask: np.ndarray | BitmaskArray, copy: bool = False ) -> None: if not 
(isinstance(values, np.ndarray) and values.dtype == np.bool_): raise TypeError( diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 89b3df8aa89f9..ae04f4a1174b6 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -15,7 +15,7 @@ lib, missing as libmissing, ) -from pandas._libs.arrays import BitMaskArray +from pandas._libs.arrays import BitmaskArray from pandas._libs.tslibs import ( get_unit_from_dtype, is_supported_unit, @@ -113,7 +113,7 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): _internal_fill_value: Scalar # our underlying data and mask are each ndarrays _data: np.ndarray - _mask: BitMaskArray + _mask: BitmaskArray # Fill values used for any/all _truthy_value = Scalar # bool(_truthy_value) = True @@ -121,26 +121,26 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): @classmethod def _simple_new( - cls, values: np.ndarray, mask: npt.NDArray[np.bool_] | BitMaskArray + cls, values: np.ndarray, mask: npt.NDArray[np.bool_] | BitmaskArray ) -> Self: result = BaseMaskedArray.__new__(cls) result._data = values - result._mask = BitMaskArray(mask) + result._mask = BitmaskArray(mask) return result def __init__( self, values: np.ndarray, - mask: npt.NDArray[np.bool_] | BitMaskArray, + mask: npt.NDArray[np.bool_] | BitmaskArray, copy: bool = False, ) -> None: # values is supposed to already be validated in the subclass if not ( - isinstance(mask, BitMaskArray) + isinstance(mask, BitmaskArray) or (isinstance(mask, np.ndarray) and mask.dtype == np.bool_) ): raise TypeError( - "mask should be boolean numpy array or BitMaskArray. " + "mask should be boolean numpy array or BitmaskArray. 
" "Use the 'pd.array' function instead" ) if isinstance(mask, np.ndarray): @@ -152,7 +152,7 @@ def __init__( mask = mask.copy() self._data = values - self._mask = BitMaskArray(mask) + self._mask = BitmaskArray(mask) @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: @@ -320,7 +320,7 @@ def __setitem__(self, key, value) -> None: value, mask = self._coerce_to_array(value, dtype=self.dtype) self._data[key] = value - if isinstance(mask, BitMaskArray): + if isinstance(mask, BitmaskArray): mask = mask.to_numpy() self._mask[key] = mask @@ -329,7 +329,7 @@ def __contains__(self, key) -> bool: if isna(key) and key is not self.dtype.na_value: # GH#52840 if self._data.dtype.kind == "f" and lib.is_float(key): - # TODO: implement low level invert operator on BitMaskArray + # TODO: implement low level invert operator on BitmaskArray return bool((np.isnan(self._data) & ~self._mask).any()) return bool(super().__contains__(key)) @@ -696,7 +696,7 @@ def _hasna(self) -> bool: return self._mask.any() def _propagate_mask( - self, mask: npt.NDArray[np.bool_] | BitMaskArray | None, other + self, mask: npt.NDArray[np.bool_] | BitmaskArray | None, other ) -> npt.NDArray[np.bool_]: if mask is None: mask = ( @@ -916,7 +916,7 @@ def _concat_same_type( ) -> Self: data = np.concatenate([x._data for x in to_concat], axis=axis) try: - mask = BitMaskArray.concatenate([x._mask for x in to_concat], axis=axis) + mask = BitmaskArray.concatenate([x._mask for x in to_concat], axis=axis) except NotImplementedError: mask = np.concatenate([x._mask.to_numpy() for x in to_concat], axis=axis) return cls(data, mask) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index babbb757c8a61..b27392ba39155 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -51,7 +51,7 @@ ) if TYPE_CHECKING: - from pandas._libs.arrays import BitMaskArray + from pandas._libs.arrays import BitmaskArray bn = import_optional_dependency("bottleneck", errors="warn") @@ -1543,7 
+1543,7 @@ def _maybe_null_out( def check_below_min_count( shape: tuple[int, ...], - mask: npt.NDArray[np.bool_] | BitMaskArray | None, + mask: npt.NDArray[np.bool_] | BitmaskArray | None, min_count: int, ) -> bool: """ diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index 8136354659d6b..dfe64a37bbd30 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -9,14 +9,14 @@ lib, missing as libmissing, ) -from pandas._libs.arrays import BitMaskArray +from pandas._libs.arrays import BitmaskArray def kleene_or( left: bool | np.ndarray | libmissing.NAType, right: bool | np.ndarray | libmissing.NAType, - left_mask: np.ndarray | BitMaskArray | None, - right_mask: np.ndarray | BitMaskArray | None, + left_mask: np.ndarray | BitmaskArray | None, + right_mask: np.ndarray | BitmaskArray | None, ): """ Boolean ``or`` using Kleene logic. @@ -54,9 +54,9 @@ def kleene_or( result = left | right if right_mask is not None: - if isinstance(left_mask, BitMaskArray): + if isinstance(left_mask, BitmaskArray): left_mask = left_mask.to_numpy() - if isinstance(right_mask, BitMaskArray): + if isinstance(right_mask, BitmaskArray): right_mask = right_mask.to_numpy() # output is unknown where (False & NA), (NA & False), (NA & NA) left_false = ~(left | left_mask) @@ -70,7 +70,7 @@ def kleene_or( if right is True: mask = np.zeros(left_mask.shape, left_mask.dtype) else: - if isinstance(left_mask, BitMaskArray): + if isinstance(left_mask, BitmaskArray): left_mask = left_mask.to_numpy() if right is libmissing.NA: mask = (~left & ~left_mask) | left_mask @@ -83,8 +83,8 @@ def kleene_or( def kleene_xor( left: bool | np.ndarray | libmissing.NAType, right: bool | np.ndarray | libmissing.NAType, - left_mask: np.ndarray | BitMaskArray | None, - right_mask: np.ndarray | BitMaskArray | None, + left_mask: np.ndarray | BitmaskArray | None, + right_mask: np.ndarray | BitmaskArray | None, ): """ Boolean ``xor`` using Kleene logic. 
@@ -126,7 +126,7 @@ def kleene_xor( if right is libmissing.NA: mask = np.ones(left_mask.shape, left_mask.dtype) else: - if isinstance(left_mask, BitMaskArray): + if isinstance(left_mask, BitmaskArray): mask = left_mask.to_numpy() else: mask = left_mask.copy() @@ -139,8 +139,8 @@ def kleene_xor( def kleene_and( left: bool | libmissing.NAType | np.ndarray, right: bool | libmissing.NAType | np.ndarray, - left_mask: np.ndarray | BitMaskArray | None, - right_mask: np.ndarray | BitMaskArray | None, + left_mask: np.ndarray | BitmaskArray | None, + right_mask: np.ndarray | BitmaskArray | None, ): """ Boolean ``and`` using Kleene logic. @@ -190,9 +190,9 @@ def kleene_and( else: # Since we must compare to left / right it helps perf to convert # to numpy up front, rather than deferring multiple times - if isinstance(left_mask, BitMaskArray): + if isinstance(left_mask, BitmaskArray): left_mask = left_mask.to_numpy() - if isinstance(right_mask, BitMaskArray): + if isinstance(right_mask, BitmaskArray): right_mask = right_mask.to_numpy() # unmask where either left or right is False diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 3e9b669913749..b9d640620655f 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -26,7 +26,7 @@ def test_floating_array_constructor(): tm.assert_numpy_array_equal(result._mask.to_numpy(), mask) msg = ( - r".* should be .* numpy array( or BitMaskArray)?. " + r".* should be .* numpy array( or BitmaskArray)?. 
" r"Use the 'pd.array' function instead" ) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index 6cc240cd52aca..9e8f941794d28 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -81,7 +81,7 @@ def test_integer_array_constructor(): tm.assert_extension_array_equal(result, expected) msg = ( - r".* should be .* numpy array( or BitMaskArray)?. " + r".* should be .* numpy array( or BitmaskArray)?. " r"Use the 'pd.array' function instead" ) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arrays/masked/test_bitmask.py b/pandas/tests/arrays/masked/test_bitmask.py index 1e1258984edaf..153078de7c32d 100644 --- a/pandas/tests/arrays/masked/test_bitmask.py +++ b/pandas/tests/arrays/masked/test_bitmask.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from pandas._libs.arrays import BitMaskArray +from pandas._libs.arrays import BitmaskArray import pandas._testing as tm @@ -20,7 +20,7 @@ ], ) def test_constructor_ndarray(array, expected): - bma = BitMaskArray(array) + bma = BitmaskArray(array) assert bma.bytes == expected assert not bma.parent assert bma.array_shape == array.shape @@ -29,27 +29,27 @@ def test_constructor_ndarray(array, expected): @pytest.mark.parametrize( "parent,expected", [ - (BitMaskArray(np.array([False, False])), bytes([0x0])), - (BitMaskArray(np.array([True, False])), bytes([0x1])), - (BitMaskArray(np.array([False, True])), bytes([0x2])), - (BitMaskArray(np.array([True, True])), bytes([0x3])), - (BitMaskArray(np.array([True, False] * 8)), bytes([0x55, 0x55])), + (BitmaskArray(np.array([False, False])), bytes([0x0])), + (BitmaskArray(np.array([True, False])), bytes([0x1])), + (BitmaskArray(np.array([False, True])), bytes([0x2])), + (BitmaskArray(np.array([True, True])), bytes([0x3])), + (BitmaskArray(np.array([True, False] * 8)), bytes([0x55, 0x55])), ], ) def 
test_constructor_bitmap(parent, expected): - bma = BitMaskArray(parent) + bma = BitmaskArray(parent) assert bma.bytes == expected assert bma.parent is parent assert bma.array_shape == parent.shape def test_len(): - bma = BitMaskArray(np.array([True, False, False])) + bma = BitmaskArray(np.array([True, False, False])) assert len(bma) == 3 def test_repr_no_parent(): - bma = BitMaskArray(np.array([True, False, False])) + bma = BitmaskArray(np.array([True, False, False])) result = repr(bma) assert "parent: None" in result assert "shape: (3,)" in result @@ -57,11 +57,11 @@ def test_repr_no_parent(): def test_repr_parent(): - parent = BitMaskArray(np.array([False, False, True])) - bma = BitMaskArray(parent) + parent = BitmaskArray(np.array([False, False, True])) + bma = BitmaskArray(parent) result = repr(bma) parent_id = hex(id(parent)) - assert f"parent: > 1) & 0x1) == 1 - result2 = ~BitMaskArray(np.array([False, True])) + result2 = ~BitmaskArray(np.array([False, True])) assert (result2.bytes[0] & 0x1) == 1 assert ((result2.bytes[0] >> 1) & 0x1) == 0 @@ -183,10 +183,10 @@ def test_invert(): ], ) def test_and(rhs_as_bitmask, lhs, rhs, expected): - bma1 = BitMaskArray(np.array(lhs)) + bma1 = BitmaskArray(np.array(lhs)) if rhs_as_bitmask: - bma2 = BitMaskArray(np.array(rhs)) + bma2 = BitmaskArray(np.array(rhs)) else: bma2 = np.array(rhs) @@ -207,10 +207,10 @@ def test_and(rhs_as_bitmask, lhs, rhs, expected): ], ) def test_or(rhs_as_bitmask, lhs, rhs, expected): - bma1 = BitMaskArray(np.array(lhs)) + bma1 = BitmaskArray(np.array(lhs)) if rhs_as_bitmask: - bma2 = BitMaskArray(np.array(rhs)) + bma2 = BitmaskArray(np.array(rhs)) else: bma2 = np.array(rhs) @@ -231,10 +231,10 @@ def test_or(rhs_as_bitmask, lhs, rhs, expected): ], ) def test_xor(rhs_as_bitmask, lhs, rhs, expected): - bma1 = BitMaskArray(np.array(lhs)) + bma1 = BitmaskArray(np.array(lhs)) if rhs_as_bitmask: - bma2 = BitMaskArray(np.array(rhs)) + bma2 = BitmaskArray(np.array(rhs)) else: bma2 = np.array(rhs) @@ 
-244,8 +244,8 @@ def test_xor(rhs_as_bitmask, lhs, rhs, expected): def test_pickle(): - parent = BitMaskArray(np.array([True, False, True])) - child = BitMaskArray(parent) + parent = BitmaskArray(np.array([True, False, True])) + child = BitmaskArray(parent) result_child = pickle.loads(pickle.dumps(child)) @@ -258,7 +258,7 @@ def test_pickle(): def test_iter(): - bma = BitMaskArray(np.array([True, False, True])) + bma = BitmaskArray(np.array([True, False, True])) itr = iter(bma) assert next(itr) is True @@ -279,7 +279,7 @@ def test_iter(): ], ) def test_size(data, expected): - bma = BitMaskArray(data) + bma = BitmaskArray(data) result = bma.size assert result == expected @@ -294,7 +294,7 @@ def test_size(data, expected): ], ) def test_nbytes(data, expected): - bma = BitMaskArray(data) + bma = BitmaskArray(data) result = bma.nbytes assert result == expected @@ -308,7 +308,7 @@ def test_nbytes(data, expected): ], ) def test_shape(data): - bma = BitMaskArray(data) + bma = BitmaskArray(data) assert bma.array_shape == data.shape @@ -322,7 +322,7 @@ def test_shape(data): ], ) def test_any(data, expected): - bma = BitMaskArray(data) + bma = BitmaskArray(data) assert bma.any() == expected @@ -336,7 +336,7 @@ def test_any(data, expected): ], ) def test_all(data, expected): - bma = BitMaskArray(data) + bma = BitmaskArray(data) assert bma.all() == expected @@ -350,12 +350,12 @@ def test_all(data, expected): ], ) def test_sum(data, expected): - bma = BitMaskArray(data) + bma = BitmaskArray(data) assert bma.sum() == expected def test_take1d(): - bma = BitMaskArray(np.array([True, False, True, False])) + bma = BitmaskArray(np.array([True, False, True, False])) result1 = bma.take_1d(np.array([0]), axis=0) assert (result1.bytes[0] & 0x1) == 1 @@ -379,25 +379,25 @@ def test_take1d(): def test_take1d_raises_not_axis0(): - bma = BitMaskArray(np.array([True, False, True])) + bma = BitmaskArray(np.array([True, False, True])) with pytest.raises(NotImplementedError, match="only 
implemented for axis=0"): bma.take_1d(np.array([1]), axis=1) def test_take_1d_raises_empty_indices(): - bma = BitMaskArray(np.array([True, False, True])) + bma = BitmaskArray(np.array([True, False, True])) with pytest.raises(NotImplementedError, match="does not support empty takes"): bma.take_1d(np.array([], dtype="int64"), axis=0) def test_take_1d_raises_negative_indices(): - bma = BitMaskArray(np.array([True, False, True])) + bma = BitmaskArray(np.array([True, False, True])) with pytest.raises(NotImplementedError, match="does not support negative indexing"): bma.take_1d(np.array([-1], dtype="int64"), axis=0) def test_copy(): - old_bma = BitMaskArray(np.array([True, False, True, False])) + old_bma = BitmaskArray(np.array([True, False, True, False])) bma = old_bma.copy() assert bma.bytes == old_bma.bytes @@ -415,7 +415,7 @@ def test_copy(): ], ) def test_to_numpy(data): - bma = BitMaskArray(data) + bma = BitmaskArray(data) result = bma.to_numpy() tm.assert_numpy_array_equal(result, data) From 0d78ac39b848295337cfca923f0a19407e7b656b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 25 Aug 2023 12:35:47 -0400 Subject: [PATCH 098/126] fix error type --- pandas/tests/arrays/masked/test_bitmask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/masked/test_bitmask.py b/pandas/tests/arrays/masked/test_bitmask.py index 153078de7c32d..746a64c626aef 100644 --- a/pandas/tests/arrays/masked/test_bitmask.py +++ b/pandas/tests/arrays/masked/test_bitmask.py @@ -392,7 +392,7 @@ def test_take_1d_raises_empty_indices(): def test_take_1d_raises_negative_indices(): bma = BitmaskArray(np.array([True, False, True])) - with pytest.raises(NotImplementedError, match="does not support negative indexing"): + with pytest.raises(ValueError, match="does not support negative indexing"): bma.take_1d(np.array([-1], dtype="int64"), axis=0) From d40a1d8c16a22a74cc8057247a558140dd2a76df Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 25 Aug 2023 12:50:46 
-0400 Subject: [PATCH 099/126] less to_numpy --- pandas/core/array_algos/masked_reductions.py | 2 +- pandas/core/arrays/boolean.py | 2 +- pandas/core/arrays/numeric.py | 2 +- pandas/tests/arrays/boolean/test_construction.py | 3 +-- pandas/tests/arrays/floating/test_arithmetic.py | 4 ++-- pandas/tests/arrays/floating/test_construction.py | 1 - pandas/tests/arrays/integer/test_construction.py | 1 - pandas/tests/arrays/integer/test_function.py | 2 +- pandas/tests/arrays/masked_shared.py | 2 +- 9 files changed, 8 insertions(+), 11 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index e7c39144fad35..3e34fb03d657e 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -60,7 +60,7 @@ def _reductions( ): return libmissing.NA - return func(values, where=~mask.to_numpy(), axis=axis, **kwargs) + return func(values, where=(~mask).to_numpy(), axis=axis, **kwargs) def sum( diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 9d0376169fe0d..ab0c81aa68c34 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -397,7 +397,7 @@ def _accumulate( self, name: str, *, skipna: bool = True, **kwargs ) -> BaseMaskedArray: data = self._data - mask = self._mask.to_numpy() + mask = self._mask if name in ("cummin", "cummax"): op = getattr(masked_accumulations, name) data, mask = op(data, mask, skipna=skipna, **kwargs) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 897ae8a89c73c..864fc2c975c86 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -146,7 +146,7 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype cls = dtype_cls.construct_array_type() if isinstance(values, cls): - values, mask = values._data, values._mask.to_numpy() + values, mask = values._data, values._mask if dtype is not None: values = 
values.astype(dtype.numpy_dtype, copy=False) diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index 37745f589e26d..12378cf719065 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -40,7 +40,6 @@ def test_boolean_array_constructor_copy(): result = BooleanArray(values, mask) assert result._data is values - # assert result._mask is mask result = BooleanArray(values, mask, copy=True) assert result._data is not values @@ -159,7 +158,7 @@ def test_coerce_to_array(): expected = BooleanArray(values, mask) tm.assert_extension_array_equal(result, expected) assert result._data is values - # assert result._mask is mask + result = BooleanArray(*coerce_to_array(values, mask=mask, copy=True)) expected = BooleanArray(values, mask) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index f7fd08361f5e1..052f38dfce5af 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -67,7 +67,7 @@ def test_pow_scalar(dtype): # TODO np.nan should be converted to pd.NA / missing before operation? 
expected = FloatingArray( np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype), - mask=a._mask.to_numpy(), + mask=a._mask, ) tm.assert_extension_array_equal(result, expected) @@ -89,7 +89,7 @@ def test_pow_scalar(dtype): result = np.nan**a expected = FloatingArray( np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), - mask=a._mask.to_numpy(), + mask=a._mask, ) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index b9d640620655f..5a58125a1c126 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -65,7 +65,6 @@ def test_floating_array_constructor_copy(): result = FloatingArray(values, mask) assert result._data is values - # assert result._mask is mask result = FloatingArray(values, mask, copy=True) assert result._data is not values diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index 9e8f941794d28..d442a26c9c4dc 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -103,7 +103,6 @@ def test_integer_array_constructor_copy(): result = IntegerArray(values, mask) assert result._data is values - # assert result._mask is mask result = IntegerArray(values, mask, copy=True) assert result._data is not values diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 40c9dcc697f46..d48b636a98feb 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -26,7 +26,7 @@ def test_ufuncs_single_float(ufunc): a = pd.array([1, 2, -3, np.nan]) with np.errstate(invalid="ignore"): result = ufunc(a) - expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask.to_numpy()) + expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask) 
tm.assert_extension_array_equal(result, expected) s = pd.Series(a) diff --git a/pandas/tests/arrays/masked_shared.py b/pandas/tests/arrays/masked_shared.py index 22caeb94a13a1..78726b2a90471 100644 --- a/pandas/tests/arrays/masked_shared.py +++ b/pandas/tests/arrays/masked_shared.py @@ -43,7 +43,7 @@ def test_scalar(self, other, comparison_op, dtype): expected = pd.array([None, None, None], dtype="boolean") else: values = op(left._data, other) - expected = pd.arrays.BooleanArray(values, left._mask.to_numpy(), copy=True) + expected = pd.arrays.BooleanArray(values, left._mask, copy=True) tm.assert_extension_array_equal(result, expected) # ensure we haven't mutated anything inplace From 35da3f6469e20191e17c42bbe0a38ae852408ec3 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 25 Aug 2023 14:42:35 -0400 Subject: [PATCH 100/126] licenses --- pandas/_libs/include/pandas/bitmask_algorithms.h | 2 ++ pandas/_libs/src/bitmask_algorithms.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pandas/_libs/include/pandas/bitmask_algorithms.h b/pandas/_libs/include/pandas/bitmask_algorithms.h index 3ca086acf77a6..f29fddf6551ed 100644 --- a/pandas/_libs/include/pandas/bitmask_algorithms.h +++ b/pandas/_libs/include/pandas/bitmask_algorithms.h @@ -1,3 +1,5 @@ +// The full license is in the LICENSE file, distributed with this software. + #pragma once #include diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index e74f7a2aa6ab3..d78d8b973a789 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -1,3 +1,5 @@ +// The full license is in the LICENSE file, distributed with this software. 
+ #include #include "pandas/bitmask_algorithms.h" From 5b7d0c2d49b7ba7018b5fde087f0d97535f7cff1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 25 Aug 2023 14:46:54 -0400 Subject: [PATCH 101/126] typing fixes --- pandas/core/arrays/numeric.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 864fc2c975c86..38cb1ccc14b7f 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -32,6 +32,7 @@ import pyarrow + from pandas._libs.arrays import BitmaskArray from pandas._typing import ( Dtype, DtypeObj, @@ -232,7 +233,7 @@ class NumericArray(BaseMaskedArray): def __init__( self, values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: npt.NDArray[np.bool_] | BitmaskArray, copy: bool = False, ) -> None: checker = self._dtype_cls._checker From e08a6471708c1d7e8ae21a0b86962d091f293aa0 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 26 Aug 2023 17:24:45 -0400 Subject: [PATCH 102/126] buffer protocol implementation for BitmaskArray --- pandas/_libs/arrays.pyi | 7 +- pandas/_libs/arrays.pyx | 205 ++++++++++++++++--- pandas/_libs/hashtable.pyi | 3 +- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- pandas/core/algorithms.py | 20 +- pandas/core/array_algos/masked_reductions.py | 9 +- pandas/core/array_algos/quantile.py | 7 +- pandas/core/arrays/masked.py | 70 +++---- pandas/core/groupby/groupby.py | 5 +- pandas/core/missing.py | 4 +- pandas/core/reshape/merge.py | 4 +- pandas/tests/arrays/boolean/test_logical.py | 4 +- pandas/tests/arrays/masked/test_bitmask.py | 57 ++++-- pandas/tests/arrays/masked_shared.py | 2 +- 14 files changed, 284 insertions(+), 115 deletions(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index d890a59218235..7a6d16c8cefd7 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -45,12 +45,13 @@ class NDArrayBacked: class BitmaskArray: parent: Self def __init__(self, data: np.ndarray | Self) -> None: ... 
+ def __len__(self) -> int: ... def __setitem__(self, key: PositionalIndexer, value: ArrayLike | bool) -> None: ... def __getitem__(self, key: PositionalIndexer) -> bool: ... def __invert__(self) -> Self: ... - def __and__(self, other: np.ndarray | Self) -> np.ndarray: ... - def __or__(self, other: np.ndarray | Self) -> np.ndarray: ... - def __xor__(self, other: np.ndarray | Self) -> np.ndarray: ... + def __and__(self, other: np.ndarray | Self | bool) -> np.ndarray: ... + def __or__(self, other: np.ndarray | Self | bool) -> np.ndarray: ... + def __xor__(self, other: np.ndarray | Self | bool) -> np.ndarray: ... def __getstate__(self) -> dict: ... def __setstate__(self, other: dict) -> None: ... def __iter__(self): ... diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 9c0186e4c221e..df1de3cfb035f 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -240,8 +240,14 @@ cdef class BitmaskArray: cdef: ArrowBitmap bitmap bint buffer_owner # set when parent is None, but gives C-level access + # NumPy compatibility + cdef Py_ssize_t ndim + cdef Py_ssize_t[2] shape + cdef Py_ssize_t[2] strides + # Buffer protocol support + int n_consumers + uint8_t* memview_buf cdef public: - object array_shape object parent # assignments gives RC to ensure proper buffer lifecycle @cython.boundscheck(False) @@ -254,24 +260,41 @@ cdef class BitmaskArray: ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, nobs) ArrowBitmapAppendInt8Unsafe(&bitmap, &arr[0], nobs) - self.buffer_owner = True self.bitmap = bitmap + self.buffer_owner = True cdef void init_from_bitmaskarray(self, BitmaskArray bma): - self.buffer_owner = False self.bitmap = bma.bitmap + self.buffer_owner = False + self.ndim = bma.ndim + self.shape[0] = bma.shape[0] + self.strides[0] = bma.strides[0] + if self.ndim == 2: + self.shape[1] = bma.shape[1] + self.strides[1] = bma.strides[1] def __cinit__(self): + cdef BitmaskArray self_ = self self.parent = False + self_.n_consumers = 0 + 
self_.memview_buf = NULL def __init__(self, data): + cdef BitmaskArray self_ = self if isinstance(data, np.ndarray): + if not data.flags["C_CONTIGUOUS"]: + data = np.ascontiguousarray(data) + self.init_from_ndarray(data.ravel()) - self.array_shape = data.shape self.parent = None + self_.ndim = data.ndim + self_.shape[0] = data.shape[0] + self_.strides[0] = data.strides[0] + if (data.ndim == 2): + self_.shape[1] = data.shape[1] + self_.strides[1] = data.strides[1] elif isinstance(data, type(self)): self.init_from_bitmaskarray(data) - self.array_shape = data.array_shape self.parent = data else: raise TypeError("Unsupported argument to BitmaskArray constructor") @@ -301,8 +324,12 @@ cdef class BitmaskArray: bitmap.buffer.data = buf bma.bitmap = bitmap - bma.array_shape = old_bma.array_shape bma.buffer_owner = True + bma.ndim = old_bma.ndim + bma.shape = old_bma.shape + bma.strides = old_bma.strides + bma.parent = False + return bma def __len__(self): @@ -314,11 +341,10 @@ cdef class BitmaskArray: else: par = None - shape = self.array_shape data = self.bytes return ( - f"{object.__repr__(self)}\nparent: {par}\nshape: {shape}\ndata: {data}\n" + f"{object.__repr__(self)}\nparent: {par}\ndata: {data}\n" ) @cython.wraparound(False) @@ -330,22 +356,39 @@ cdef class BitmaskArray: cdef BitmaskArray current_bma cdef Py_ssize_t nbitmaps = len(objs) - cdef Py_ssize_t second_dim = 0 - if any(len(x.array_shape) > 1 for x in objs): - second_dim = objs[0].array_shape[1] - for obj in objs: - if not obj.array_shape[1] == second_dim: - raise NotImplementedError( - "BitmaskArray.concatenate does not support broadcasting" - ) + cdef BitmaskArray first_bma = objs[0] + cdef int expected_ndim = first_bma.ndim + cdef Py_ssize_t expected_stride0 = first_bma.strides[0] + cdef Py_ssize_t expected_shape1, expected_stride1 + if expected_ndim == 2: + expected_stride1 = first_bma.strides[1] + expected_shape1 = first_bma.shape[1] + + cdef Py_ssize_t dim0shape = 0 cdef ArrowBitmap** bitmaps = malloc( 
sizeof(ArrowBitmap*) * nbitmaps ) + for i in range(nbitmaps): current_bma = objs[i] + if ( + current_bma.ndim != expected_ndim + or current_bma.strides[0] != expected_stride0 + or ( + expected_ndim == 2 and ( + current_bma.shape[1] != expected_shape1 + or current_bma.strides[1] != expected_stride1 + ) + ) + ): + free(bitmaps) + raise NotImplementedError( + "BitmaskArray.concatenate does not support broadcasting" + ) total_bits += current_bma.bitmap.size_bits bitmaps[i] = ¤t_bma.bitmap + dim0shape += current_bma.shape[0] # Bypass __init__ calls cdef BitmaskArray bma = BitmaskArray.__new__(BitmaskArray) @@ -357,11 +400,15 @@ cdef class BitmaskArray: free(bitmaps) bma.bitmap = bitmap - if second_dim != 0: - bma.array_shape = tuple((total_bits // second_dim, second_dim)) - else: - bma.array_shape = tuple((total_bits,)) bma.buffer_owner = True + + bma.ndim = expected_ndim + bma.shape[0] = dim0shape # only allowed because of axis=0 assumption + bma.strides[0] = expected_stride0 + if expected_ndim == 2: + bma.shape[1] = expected_shape1 + bma.strides[1] = expected_stride1 + bma.parent = None return bma @@ -446,8 +493,10 @@ cdef class BitmaskArray: BitmapInvert(&self_.bitmap, &bitmap) bma.bitmap = bitmap - bma.array_shape = self.array_shape bma.buffer_owner = True + bma.ndim = self_.ndim + bma.shape = self_.shape + bma.strides = self_.strides bma.parent = None return bma @@ -461,7 +510,10 @@ cdef class BitmaskArray: # TODO: maybe should return Self here instead of ndarray other_bma = other if self_.bitmap.size_bits == 0: - return np.empty(dtype=bool).reshape(self.array_shape) + result = np.empty([], dtype=bool) + if self_.ndim == 2: + return result.reshape(self_.shape[0], self_.shape[1]) + return result if self_.bitmap.size_bits != other_bma.bitmap.size_bits: raise ValueError("bitmaps are not equal size") @@ -477,7 +529,10 @@ cdef class BitmaskArray: bitmap.size_bits ) ArrowBitmapReset(&bitmap) - return result.reshape(self.array_shape) + + if self_.ndim == 2: + return 
result.reshape(self_.shape[0], self_.shape[1]) + return result return self.to_numpy() & other @@ -490,7 +545,10 @@ cdef class BitmaskArray: # TODO: maybe should return Self here instead of ndarray other_bma = other if self_.bitmap.size_bits == 0: - return np.empty(dtype=bool).reshape(self.array_shape) + result = np.empty([], dtype=bool) + if self_.ndim == 2: + return result.reshape(self_.shape[0], self_.shape[1]) + return result if self_.bitmap.size_bits != other_bma.bitmap.size_bits: raise ValueError("bitmaps are not equal size") @@ -506,7 +564,10 @@ cdef class BitmaskArray: bitmap.size_bits ) ArrowBitmapReset(&bitmap) - return result.reshape(self.array_shape) + + if self_.ndim == 2: + return result.reshape(self_.shape[0], self_.shape[1]) + return result return self.to_numpy() | other @@ -519,7 +580,10 @@ cdef class BitmaskArray: # TODO: maybe should return Self here instead of ndarray other_bma = other if self_.bitmap.size_bits == 0: - return np.empty(dtype=bool).reshape(self.array_shape) + result = np.empty([], dtype=bool) + if self_.ndim == 2: + return result.reshape(self_.shape[0], self_.shape[1]) + return result if self_.bitmap.size_bits != other_bma.bitmap.size_bits: raise ValueError("bitmaps are not equal size") @@ -535,7 +599,9 @@ cdef class BitmaskArray: bitmap.size_bits ) ArrowBitmapReset(&bitmap) - return result.reshape(self.array_shape) + if self_.ndim == 2: + return result.reshape(self_.shape[0], self_.shape[1]) + return result return self.to_numpy() ^ other @@ -543,7 +609,10 @@ cdef class BitmaskArray: cdef BitmaskArray self_ = self state = { "parent": self.parent, - "array_shape": self.array_shape, + "ndim": self_.ndim, + "shape0": self_.shape[0], + "stride0": self_.strides[0], + "n_consumers": self_.n_consumers, "buffer_owner": self_.buffer_owner, # Private ArrowBitmap attributes below "bitmap.buffer.size_bytes": self_.bitmap.buffer.size_bytes, @@ -551,6 +620,18 @@ cdef class BitmaskArray: "bitmap.size_bits": self_.bitmap.size_bits } + if 
self_.ndim == 2: + state["shape1"] = self_.shape[1] + state["stride1"] = self_.strides[1] + + # memview should only exist when n_consumers > 0 + if self_.n_consumers > 0: + memview_buf_data = bytearray(len(self)) + for i in range(len(self)): + memview_buf_data[i] = self_.memview_buf[i] + + state["memview_buf_data"] = memview_buf_data + # Only parents own data if self_.buffer_owner: bitmap_data = bytearray(self_.bitmap.buffer.size_bytes) @@ -565,12 +646,26 @@ cdef class BitmaskArray: cdef ArrowBitmap bitmap cdef BitmaskArray self_ = self, other self.parent = state["parent"] - self.array_shape = state["array_shape"] + self_.ndim = state["ndim"] + self_.shape[0] = state["shape0"] + self_.strides[0] = state["stride0"] + self_.n_consumers = state["n_consumers"] self_.buffer_owner = state["buffer_owner"] nbytes = state["bitmap.buffer.size_bytes"] capacity_bytes = state["bitmap.buffer.capacity_bytes"] nbits = state["bitmap.size_bits"] + + if self_.ndim == 2: + self_.shape[1] = state["shape1"] + self_.strides[1] = state["stride1"] + + if self_.n_consumers > 0: + self_.memview_buf = malloc(nbits) + memview_buf_data = state["memview_buf_data"] + for i in range(nbits): + self_.memview_buf[i] = memview_buf_data[i] + if not self_.buffer_owner: other = self.parent self_.bitmap = other.bitmap @@ -599,6 +694,38 @@ cdef class BitmaskArray: for i in range(self_.bitmap.size_bits): yield bool(ArrowBitGet(self_.bitmap.buffer.data, i)) + def __getbuffer__(self, Py_buffer *buffer, int flags): + cdef BitmaskArray self_ = self + + if self_.n_consumers == 0: + self_.memview_buf = malloc(self_.bitmap.size_bits) + ArrowBitsUnpackInt8( + self_.bitmap.buffer.data, + 0, + self_.bitmap.size_bits, + self_.memview_buf + ) + + buffer.buf = self_.memview_buf + buffer.format = "?" 
+ buffer.internal = NULL + buffer.itemsize = 1 + buffer.len = self_.bitmap.size_bits + buffer.ndim = self_.ndim + buffer.obj = self + buffer.readonly = 1 + buffer.shape = self_.shape + buffer.strides = self_.strides + buffer.suboffsets = NULL + + self_.n_consumers += 1 + + def __releasebuffer__(self, Py_buffer *buffer): + cdef BitmaskArray self_ = self + self_.n_consumers -= 1 + if self_.n_consumers == 0: + free(self_.memview_buf) + @property def size(self) -> int: return self.bitmap.size_bits @@ -619,12 +746,15 @@ cdef class BitmaskArray: @property def shape(self): """Strictly for NumPy compat in mask_ops""" - return self.array_shape + cdef BitmaskArray self_ = self + if self_.ndim == 1: + return tuple((self_.shape[0],)) + return tuple((self_.shape[0], self_.shape[1])) @property def dtype(self): """Strictly for NumPy compat in mask_ops""" - return bool + return np.dtype("bool") def any(self) -> bool: return BitmapAny(&self.bitmap) @@ -663,8 +793,14 @@ cdef class BitmaskArray: raise ValueError("take_1d does not support negative indexing") bma.bitmap = bitmap - bma.array_shape = tuple((indices.shape[0],)) bma.buffer_owner = True + + bma.ndim = self_.ndim + bma.shape[0] = indices.shape[0] + bma.strides = self_.strides + + bma.parent = None + return bma def copy(self): @@ -677,6 +813,7 @@ cdef class BitmaskArray: ArrowBitsUnpackInt8(buf, 0, size, &out[0]) def to_numpy(self) -> ndarray: + cdef BitmaskArray self_ = self cdef ndarray[uint8_t] result = np.empty(self.bitmap.size_bits, dtype=bool) BitmaskArray.buffer_to_array_1d( result, @@ -684,4 +821,6 @@ cdef class BitmaskArray: self.bitmap.size_bits ) - return result.reshape(self.array_shape) + if self_.ndim == 2: + return result.reshape(self_.shape[0], self_.shape[1]) + return result diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 2bc6d74fe6aee..cd51875eb3df1 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -6,6 +6,7 @@ from typing import ( import numpy as np +from 
pandas._libs.arrays import BitmaskArray from pandas._typing import npt def unique_label_indices( @@ -239,7 +240,7 @@ def mode( def value_count( values: np.ndarray, dropna: bool, - mask: npt.NDArray[np.bool_] | None = ..., + mask: npt.NDArray[np.bool_] | BitmaskArray | None = ..., ) -> tuple[np.ndarray, npt.NDArray[np.int64]]: ... # np.ndarray[same-as-values] # arr and values should have same dtype diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 1cf5d734705af..4a94e5c256eb3 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -665,7 +665,7 @@ cdef class {{name}}HashTable(HashTable): rmd = result_mask.data if use_mask: - mask_values = mask.view("uint8") + mask_values = mask if use_na_value: # We need this na_value2 because we want to allow users diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b4c592af9ff5f..53a75754a49c2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -21,6 +21,7 @@ iNaT, lib, ) +from pandas._libs.arrays import BitmaskArray from pandas._typing import ( AnyArrayLike, ArrayLike, @@ -424,7 +425,7 @@ def nunique_ints(values: ArrayLike) -> int: return result -def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None): +def unique_with_mask(values, mask: npt.NDArray[np.bool_] | BitmaskArray | None = None): """See algorithms.unique for docs. 
Takes a mask for masked arrays.""" values = _ensure_arraylike(values, func_name="unique") @@ -442,10 +443,9 @@ def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None): return uniques else: - uniques, mask = table.unique(values, mask=mask) + uniques, np_mask = table.unique(values, mask=mask) uniques = _reconstruct_data(uniques, original.dtype, original) - assert mask is not None # for mypy - return uniques, mask.astype("bool") + return uniques, np_mask.astype("bool") unique1d = unique @@ -550,7 +550,7 @@ def factorize_array( use_na_sentinel: bool = True, size_hint: int | None = None, na_value: object = None, - mask: npt.NDArray[np.bool_] | None = None, + mask: npt.NDArray[np.bool_] | BitmaskArray | None = None, ) -> tuple[npt.NDArray[np.intp], np.ndarray]: """ Factorize a numpy array to codes and uniques. @@ -946,7 +946,9 @@ def value_counts_internal( # Called once from SparseArray, otherwise could be private def value_counts_arraylike( - values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None + values: np.ndarray, + dropna: bool, + mask: npt.NDArray[np.bool_] | BitmaskArray | None = None, ) -> tuple[ArrayLike, npt.NDArray[np.int64]]: """ Parameters @@ -970,7 +972,7 @@ def value_counts_arraylike( if dropna: mask = keys != iNaT - keys, counts = keys[mask], counts[mask] + keys, counts = keys[mask], counts[mask] # type: ignore[index] res_keys = _reconstruct_data(keys, original.dtype, original) return res_keys, counts @@ -1293,7 +1295,9 @@ def take( ... 
fill_value=-10) array([ 10, 10, -10]) """ - if not isinstance(arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries)): + if not isinstance( + arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries, BitmaskArray) + ): # GH#52981 warnings.warn( "pd.api.extensions.take accepting non-standard inputs is deprecated " diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 3e34fb03d657e..2149d168cf898 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -13,11 +13,11 @@ import numpy as np from pandas._libs import missing as libmissing -from pandas._libs.arrays import BitmaskArray from pandas.core.nanops import check_below_min_count if TYPE_CHECKING: + from pandas._libs.arrays import BitmaskArray from pandas._typing import AxisInt @@ -60,7 +60,7 @@ def _reductions( ): return libmissing.NA - return func(values, where=(~mask).to_numpy(), axis=axis, **kwargs) + return func(values, where=~mask, axis=axis, **kwargs) def sum( @@ -119,10 +119,7 @@ def _minmax( else: return func(values, axis=axis) else: - if isinstance(mask, BitmaskArray): - subset = values[(~mask).to_numpy()] - else: - subset = values[~mask] + subset = values[~mask] # type: ignore[index] if subset.size: return func(subset, axis=axis) else: diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index ee6f00b219a15..f6ea0b3fff3d6 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -10,6 +10,7 @@ ) if TYPE_CHECKING: + from pandas._libs.arrays import BitmaskArray from pandas._typing import ( ArrayLike, Scalar, @@ -43,7 +44,7 @@ def quantile_compat( def quantile_with_mask( values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: npt.NDArray[np.bool_] | BitmaskArray, fill_value, qs: npt.NDArray[np.float64], interpolation: str, @@ -80,7 +81,7 @@ def quantile_with_mask( if values.ndim == 1: # unsqueeze, operate, re-squeeze values 
= np.atleast_2d(values) - mask = np.atleast_2d(mask) + mask = np.atleast_2d(mask) # type: ignore[arg-type] res_values = quantile_with_mask(values, mask, fill_value, qs, interpolation) return res_values[0] @@ -157,7 +158,7 @@ def _nanpercentile( qs: npt.NDArray[np.float64], *, na_value, - mask: npt.NDArray[np.bool_], + mask: npt.NDArray[np.bool_] | BitmaskArray, interpolation: str, ): """ diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index ae04f4a1174b6..9e5180b3cad4e 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -216,13 +216,13 @@ def pad_or_backfill( limit_area: Literal["inside", "outside"] | None = None, copy: bool = True, ) -> Self: - mask = self._mask.to_numpy() + mask = self._mask if mask.any(): func = missing.get_fill_func(method, ndim=self.ndim) npvalues = self._data.T - new_mask = mask.T + new_mask = mask.to_numpy().T if copy: npvalues = npvalues.copy() new_mask = new_mask.copy() @@ -244,7 +244,7 @@ def fillna( ) -> Self: value, method = validate_fillna_kwargs(value, method) - mask = self._mask.to_numpy() + mask = self._mask value = missing.check_value_size(value, mask, len(self)) @@ -252,7 +252,7 @@ def fillna( if method is not None: func = missing.get_fill_func(method, ndim=self.ndim) npvalues = self._data.T - new_mask = mask.T + new_mask = mask.to_numpy().T if copy: npvalues = npvalues.copy() new_mask = new_mask.copy() @@ -368,7 +368,7 @@ def swapaxes(self, axis1, axis2) -> Self: def delete(self, loc, axis: AxisInt = 0) -> Self: data = np.delete(self._data, loc, axis=axis) - mask = np.delete(self._mask.to_numpy(), loc, axis=axis) + mask = np.delete(self._mask, loc, axis=axis) # type: ignore[call-overload] return self._simple_new(data, mask) def reshape(self, *args, **kwargs) -> Self: @@ -414,7 +414,7 @@ def round(self, decimals: int = 0, *args, **kwargs): values = np.round(self._data, decimals=decimals, **kwargs) # Usually we'll get same type as self, but ndarray[bool] casts to float - return 
self._maybe_mask_result(values, self._mask.to_numpy()) + return self._maybe_mask_result(values, self._mask) # ------------------------------------------------------------------ # Unary Methods @@ -520,7 +520,7 @@ def to_numpy( with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=RuntimeWarning) data = self._data.astype(dtype) - data[self._mask.to_numpy()] = na_value + data[self._mask] = na_value else: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=RuntimeWarning) @@ -563,9 +563,9 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: data = self._data.astype(dtype.numpy_dtype, copy=copy) # mask is copied depending on whether the data was copied, and # not directly depending on the `copy` keyword - mask = self._mask if data is self._data else self._mask.to_numpy() + mask = self._mask if data is self._data else self._mask.copy() cls = dtype.construct_array_type() - return cls(data, mask, copy=False) # type: ignore[arg-type] + return cls(data, mask, copy=False) if isinstance(dtype, ExtensionDtype): eacls = dtype.construct_array_type() @@ -637,7 +637,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): inputs2 = [] for x in inputs: if isinstance(x, BaseMaskedArray): - mask |= x._mask.to_numpy() + mask |= x._mask inputs2.append(x._data) else: inputs2.append(x) @@ -699,9 +699,7 @@ def _propagate_mask( self, mask: npt.NDArray[np.bool_] | BitmaskArray | None, other ) -> npt.NDArray[np.bool_]: if mask is None: - mask = ( - self._mask.to_numpy() - ) # TODO: need test for BooleanArray needing a copy + mask = self._mask.copy() # TODO: need test for BooleanArray needing a copy if other is libmissing.NA: # GH#45421 don't alter inplace mask = mask | True @@ -788,21 +786,21 @@ def _arith_method(self, other, op): if op_name == "pow": # 1 ** x is 1. 
- mask = np.where((self._data == 1) & (~self._mask).to_numpy(), False, mask) + mask = np.where((self._data == 1) & ~self._mask, False, mask) # x ** 0 is 1. if omask is not None: - mask = np.where((other == 0) & (~omask).to_numpy(), False, mask) + mask = np.where((other == 0) & ~omask, False, mask) elif other is not libmissing.NA: mask = np.where(other == 0, False, mask) elif op_name == "rpow": # 1 ** x is 1. if omask is not None: - mask = np.where((other == 1) & (~omask).to_numpy(), False, mask) + mask = np.where((other == 1) & ~omask, False, mask) elif other is not libmissing.NA: mask = np.where(other == 1, False, mask) # x ** 0 is 1. - mask = np.where((self._data == 0) & (~self._mask).to_numpy(), False, mask) + mask = np.where((self._data == 0) & ~self._mask, False, mask) return self._maybe_mask_result(result, mask) @@ -814,7 +812,7 @@ def _cmp_method(self, other, op) -> BooleanArray: mask = None if isinstance(other, BaseMaskedArray): - other, mask = other._data, other._mask.to_numpy() + other, mask = other._data, other._mask elif is_list_like(other): other = np.asarray(other) @@ -849,7 +847,9 @@ def _cmp_method(self, other, op) -> BooleanArray: return BooleanArray(result, mask, copy=False) def _maybe_mask_result( - self, result: np.ndarray | tuple[np.ndarray, np.ndarray], mask: np.ndarray + self, + result: np.ndarray | tuple[np.ndarray, np.ndarray], + mask: np.ndarray | BitmaskArray, ): """ Parameters @@ -979,7 +979,7 @@ def isin(self, values) -> BooleanArray: # type: ignore[override] # For now, NA does not propagate so set result according to presence of NA, # see https://github.com/pandas-dev/pandas/pull/38379 for some discussion - result[self._mask.to_numpy()] = values_have_NA + result[self._mask] = values_have_NA mask = np.zeros(self._data.shape, dtype=bool) return BooleanArray(result, mask, copy=False) @@ -997,7 +997,7 @@ def unique(self) -> Self: ------- uniques : BaseMaskedArray """ - uniques, mask = algos.unique_with_mask(self._data, 
self._mask.to_numpy()) + uniques, mask = algos.unique_with_mask(self._data, self._mask) return self._simple_new(uniques, mask) @doc(ExtensionArray.searchsorted) @@ -1023,7 +1023,7 @@ def factorize( use_na_sentinel: bool = True, ) -> tuple[np.ndarray, ExtensionArray]: arr = self._data - mask = self._mask.to_numpy() + mask = self._mask # Use a sentinel for na; recode and add NA to uniques if necessary below codes, uniques = factorize_array(arr, use_na_sentinel=True, mask=mask) @@ -1039,7 +1039,7 @@ def factorize( size = len(uniques) + 1 uniques_mask = np.zeros(size, dtype=bool) if not use_na_sentinel and has_na: - na_index = mask.argmax() + na_index = mask.to_numpy().argmax() # Insert na with the proper code if na_index == 0: na_code = np.intp(0) @@ -1082,7 +1082,7 @@ def value_counts(self, dropna: bool = True) -> Series: from pandas.arrays import IntegerArray keys, value_counts = algos.value_counts_arraylike( - self._data, dropna=True, mask=self._mask.to_numpy() + self._data, dropna=True, mask=self._mask ) if dropna: @@ -1116,8 +1116,8 @@ def equals(self, other) -> bool: if not np.array_equal(self._mask.to_numpy(), other._mask.to_numpy()): return False - left = self._data[(~self._mask).to_numpy()] - right = other._data[(~other._mask).to_numpy()] + left = self._data[~self._mask] # type: ignore[call-overload] + right = other._data[~other._mask] return array_equivalent(left, right, strict_nan=True, dtype_equal=True) def _quantile( @@ -1133,7 +1133,7 @@ def _quantile( """ res = quantile_with_mask( self._data, - mask=self._mask.to_numpy(), + mask=self._mask, # TODO(GH#40932): na_value_for_dtype(self.dtype.numpy_dtype) # instead of np.nan fill_value=np.nan, @@ -1172,7 +1172,7 @@ def _reduce( else: # median, skew, kurt, sem data = self._data - mask = self._mask.to_numpy() + mask = self._mask op = getattr(nanops, f"nan{name}") axis = kwargs.pop("axis", None) result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs) @@ -1182,8 +1182,8 @@ def _reduce( return 
self._wrap_na_result(name=name, axis=0, mask_size=(1,)) else: result = result.reshape(1) - mask = np.zeros(1, dtype=bool) - return self._maybe_mask_result(result, mask) + np_mask = np.zeros(1, dtype=bool) + return self._maybe_mask_result(result, np_mask) if isna(result): return libmissing.NA @@ -1406,7 +1406,7 @@ def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" np.putmask( values, - self._mask.to_numpy(), + self._mask, # type: ignore[arg-type] self._falsey_value, # type: ignore[arg-type] ) else: @@ -1501,7 +1501,7 @@ def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" np.putmask( values, - self._mask.to_numpy(), + self._mask, # type: ignore[arg-type] self._truthy_value, # type: ignore[arg-type] ) else: @@ -1547,9 +1547,9 @@ def _groupby_op( op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) # libgroupby functions are responsible for NOT altering mask - mask = self._mask.to_numpy() + mask = self._mask if op.kind != "aggregate": - result_mask = mask.copy() + result_mask = mask.to_numpy() else: result_mask = np.zeros(ngroups, dtype=bool) @@ -1558,7 +1558,7 @@ def _groupby_op( min_count=min_count, ngroups=ngroups, comp_ids=ids, - mask=mask, + mask=mask.to_numpy(), result_mask=result_mask, **kwargs, ) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8c1d5075480b6..2692ae29fe9d1 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -143,6 +143,8 @@ class providing the base-class of operations. 
if TYPE_CHECKING: from typing import Any + from pandas._libs.arrays import BitmaskArray + from pandas.core.window import ( ExpandingGroupby, ExponentialMovingWindowGroupby, @@ -4387,8 +4389,9 @@ def post_processor( def blk_func(values: ArrayLike) -> ArrayLike: orig_vals = values + mask: np.ndarray | BitmaskArray if isinstance(values, BaseMaskedArray): - mask = values._mask.to_numpy() + mask = values._mask result_mask = np.zeros((ngroups, nqs), dtype=np.bool_) else: mask = isna(values) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 58b0e2907b8ce..c3b258481a2f1 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -46,10 +46,12 @@ ) if TYPE_CHECKING: + from pandas._libs.arrays import BitmaskArray + from pandas import Index -def check_value_size(value, mask: npt.NDArray[np.bool_], length: int): +def check_value_size(value, mask: npt.NDArray[np.bool_] | BitmaskArray, length: int): """ Validate the size of the values passed to ExtensionArray.fillna. """ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 78ba95e959042..140a3024a8684 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2456,8 +2456,8 @@ def _factorize_keys( if isinstance(lk, BaseMaskedArray): assert isinstance(rk, BaseMaskedArray) - llab = rizer.factorize(lk._data, mask=lk._mask.to_numpy()) - rlab = rizer.factorize(rk._data, mask=rk._mask.to_numpy()) + llab = rizer.factorize(lk._data, mask=lk._mask) + rlab = rizer.factorize(rk._data, mask=rk._mask) elif isinstance(lk, ArrowExtensionArray): assert isinstance(rk, ArrowExtensionArray) # we can only get here with numeric dtypes diff --git a/pandas/tests/arrays/boolean/test_logical.py b/pandas/tests/arrays/boolean/test_logical.py index 4cdaf3a90b21d..66c117ea3fc66 100644 --- a/pandas/tests/arrays/boolean/test_logical.py +++ b/pandas/tests/arrays/boolean/test_logical.py @@ -238,8 +238,8 @@ def test_no_masked_assumptions(self, other, all_logical_operators): 
tm.assert_extension_array_equal(result, expected) if isinstance(other, BooleanArray): - other._data[other._mask.to_numpy()] = True - a._data[a._mask.to_numpy()] = False + other._data[other._mask] = True + a._data[a._mask] = False result = getattr(a, all_logical_operators)(other) expected = getattr(b, all_logical_operators)(other) diff --git a/pandas/tests/arrays/masked/test_bitmask.py b/pandas/tests/arrays/masked/test_bitmask.py index 746a64c626aef..a7b80c9a2c0c5 100644 --- a/pandas/tests/arrays/masked/test_bitmask.py +++ b/pandas/tests/arrays/masked/test_bitmask.py @@ -12,18 +12,22 @@ @pytest.mark.parametrize( "array,expected", [ - (np.array([False, False]), bytes([0x0])), - (np.array([True, False]), bytes([0x1])), - (np.array([False, True]), bytes([0x2])), - (np.array([True, True]), bytes([0x3])), - (np.array([True, False] * 8), bytes([0x55, 0x55])), + pytest.param(np.array([False, False]), bytes([0x0]), id="all_false"), + pytest.param(np.array([True, False]), bytes([0x1]), id="first_true"), + pytest.param(np.array([False, True]), bytes([0x2]), id="second_true"), + pytest.param(np.array([True, True]), bytes([0x3]), id="all_true"), + pytest.param(np.array([True, False] * 8), bytes([0x55, 0x55]), id="multibyte"), + pytest.param( + np.array([[False, False], [True, True], [False, False]])[:, 0], + [False, True, False], + id="non-contiguous", + ), ], ) def test_constructor_ndarray(array, expected): bma = BitmaskArray(array) assert bma.bytes == expected assert not bma.parent - assert bma.array_shape == array.shape @pytest.mark.parametrize( @@ -40,7 +44,6 @@ def test_constructor_bitmap(parent, expected): bma = BitmaskArray(parent) assert bma.bytes == expected assert bma.parent is parent - assert bma.array_shape == parent.shape def test_len(): @@ -52,7 +55,6 @@ def test_repr_no_parent(): bma = BitmaskArray(np.array([True, False, False])) result = repr(bma) assert "parent: None" in result - assert "shape: (3,)" in result assert "data: b'\\x01'" in result @@ -60,9 +62,8 @@ 
def test_repr_parent(): parent = BitmaskArray(np.array([False, False, True])) bma = BitmaskArray(parent) result = repr(bma) - parent_id = hex(id(parent)) - assert f"parent: > 1) & 0x1) == 0 - result4 = bma.take_1d(np.array([0, 0]), axis=0) + result4 = bma.take_1d(np.array([0, 0], dtype=np.int64), axis=0) assert (result4.bytes[0] & 0x1) == 1 assert ((result4.bytes[0] >> 1) & 0x1) == 1 - result5 = bma.take_1d(np.array([3, 2, 1, 0]), axis=0) + result5 = bma.take_1d(np.array([3, 2, 1, 0], dtype=np.int64), axis=0) assert (result5.bytes[0] & 0x1) == 0 assert ((result5.bytes[0] >> 1) & 0x1) == 1 assert ((result5.bytes[0] >> 2) & 0x1) == 0 @@ -419,3 +419,24 @@ def test_to_numpy(data): result = bma.to_numpy() tm.assert_numpy_array_equal(result, data) + + +@pytest.mark.parametrize( + "array,expected", + [ + pytest.param(np.array([False, False]), [False, False], id="all_false"), + pytest.param(np.array([True, False]), [True, False], id="first_true"), + pytest.param(np.array([False, True]), [False, True], id="second_true"), + pytest.param(np.array([True, True]), [True, True], id="all_true"), + pytest.param(np.array([True, False] * 8), [True, False] * 8, id="multibyte"), + pytest.param( + np.array([[False, False], [True, True], [False, False]])[:, 0], + [False, True, False], + id="non-contiguous", + ), + ], +) +def test_memoryview(array, expected): + bma = BitmaskArray(array) + vw = memoryview(bma) + assert vw.tolist() == expected diff --git a/pandas/tests/arrays/masked_shared.py b/pandas/tests/arrays/masked_shared.py index 78726b2a90471..2f025d039389d 100644 --- a/pandas/tests/arrays/masked_shared.py +++ b/pandas/tests/arrays/masked_shared.py @@ -16,7 +16,7 @@ def _compare_other(self, data, op, other): expected = pd.Series(op(data._data, other), dtype="boolean") # fill the nan locations - expected[data._mask.to_numpy()] = pd.NA + expected[data._mask.to_numpy()] = pd.NA # TODO: have series accept memview tm.assert_series_equal(result, expected) From 
a0d538a4a8ee835adbd26455b9ee3443aaff2c63 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 28 Aug 2023 09:57:50 -0400 Subject: [PATCH 103/126] fixups --- pandas/_libs/arrays.pyx | 2 +- pandas/core/arrays/masked.py | 10 +--------- pandas/tests/arrays/masked/test_bitmask.py | 2 +- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index df1de3cfb035f..010629d257547 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -469,7 +469,7 @@ cdef class BitmaskArray: def __getitem__(self, key): cdef Py_ssize_t ckey # to_numpy can be expensive, so try to avoid for simple cases - if isinstance(key, int): + if isinstance(key, int) and self.ndim == 1: ckey = key if ckey >= 0 and ckey < self.bitmap.size_bits: return bool(ArrowBitGet(self.bitmap.buffer.data, ckey)) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 9e5180b3cad4e..445986e642828 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -191,21 +191,13 @@ def __getitem__(self, item: SequenceIndexer) -> Self: def __getitem__(self, item: PositionalIndexer) -> Self | Any: item = check_array_indexer(self, item) - # TODO: some of the numpy semantics for handling 2D indexing - # are not implemented in the bitmaskarray, hence the to_numpy() - # requirement, though that slows things down - np_mask = self._mask.to_numpy() - newmask = np_mask[item] + newmask = self._mask[item] if is_bool(newmask): # This is a scalar indexing if newmask: return self.dtype.na_value return self._data[item] - # sending self._mask avoids copy of buffer - if np.array_equal(newmask, np_mask): - return self._simple_new(self._data[item], self._mask) - return self._simple_new(self._data[item], newmask) def pad_or_backfill( diff --git a/pandas/tests/arrays/masked/test_bitmask.py b/pandas/tests/arrays/masked/test_bitmask.py index a7b80c9a2c0c5..42b471508864f 100644 --- a/pandas/tests/arrays/masked/test_bitmask.py +++ 
b/pandas/tests/arrays/masked/test_bitmask.py @@ -19,7 +19,7 @@ pytest.param(np.array([True, False] * 8), bytes([0x55, 0x55]), id="multibyte"), pytest.param( np.array([[False, False], [True, True], [False, False]])[:, 0], - [False, True, False], + bytes([0x2]), id="non-contiguous", ), ], From 9a97677f329a7a76e602867eb1f07950d435fa5c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 28 Aug 2023 10:52:12 -0400 Subject: [PATCH 104/126] getitem fastpath for slice --- pandas/_libs/arrays.pyx | 28 ++++++++++++++++++++++ pandas/tests/arrays/masked/test_bitmask.py | 12 ++++++++++ 2 files changed, 40 insertions(+) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 010629d257547..41363cbe666fc 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -7,6 +7,7 @@ import numpy as np cimport numpy as cnp from cpython cimport PyErr_Clear +from cpython.slice cimport PySlice_Unpack from libc.stdlib cimport ( free, malloc, @@ -468,6 +469,11 @@ cdef class BitmaskArray: def __getitem__(self, key): cdef Py_ssize_t ckey + cdef Py_ssize_t start, stop, step + cdef BitmaskArray bma + cdef ArrowBitmap bitmap + cdef int64_t nbytes + cdef BitmaskArray self_ = self # to_numpy can be expensive, so try to avoid for simple cases if isinstance(key, int) and self.ndim == 1: ckey = key @@ -475,6 +481,28 @@ cdef class BitmaskArray: return bool(ArrowBitGet(self.bitmap.buffer.data, ckey)) elif is_null_slice(key): return self.copy() + elif isinstance(key, slice): + # fastpath for slices that start at 0 and step 1 at a time + # towards a positive number. 
+ # TODO: upstream generic ArrowBitsGet function in nanoarrow + PySlice_Unpack(key, &start, &stop, &step) + if start == 0 and stop > 0 and step == 1: + bma = BitmaskArray.__new__(BitmaskArray) + ArrowBitmapInit(&bitmap) + nbytes = (stop + 7) // 8 + ArrowBitmapReserve(&bitmap, nbytes) + memcpy(bitmap.buffer.data, self_.bitmap.buffer.data, nbytes) + bitmap.buffer.size_bytes = nbytes + bitmap.size_bits = stop + + bma.bitmap = bitmap + bma.buffer_owner = True + bma.ndim = self_.ndim + bma.shape = self_.shape + bma.strides = self_.strides + bma.parent = False + + return bma return self.to_numpy()[key] diff --git a/pandas/tests/arrays/masked/test_bitmask.py b/pandas/tests/arrays/masked/test_bitmask.py index 42b471508864f..351cae23384ae 100644 --- a/pandas/tests/arrays/masked/test_bitmask.py +++ b/pandas/tests/arrays/masked/test_bitmask.py @@ -114,12 +114,24 @@ def test_getitem_null_slice(): result = bma[:] assert not result.parent + assert len(result) == 3 assert result.bytes[0] & 1 == 1 assert (result.bytes[0] >> 1) & 1 == 0 assert (result.bytes[0] >> 2) & 1 == 1 +def test_getitem_monotonic_slice(): + bma = BitmaskArray(np.array([True, False, True])) + result = bma[slice(2)] + + assert not result.parent + assert len(result) == 2 + + assert result.bytes[0] & 1 == 1 + assert (result.bytes[0] >> 1) & 1 == 0 + + @pytest.mark.parametrize( "indexer,expected", [ From 96f080d94f24b4ec5931b91823fa7525f146a5e7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 28 Aug 2023 10:59:05 -0400 Subject: [PATCH 105/126] mypy fix --- pandas/core/arrays/masked.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 445986e642828..a4343970d50dc 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -198,6 +198,7 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self.dtype.na_value return self._data[item] + assert not isinstance(newmask, bool) # for mypy return 
self._simple_new(self._data[item], newmask) def pad_or_backfill( From e35b769900eef034b8586cd3ae43c9b6bda414cc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 28 Aug 2023 12:30:08 -0400 Subject: [PATCH 106/126] fix OOB memcpy --- pandas/_libs/arrays.pyx | 5 ++++- pandas/tests/arrays/masked/test_bitmask.py | 18 +++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 41363cbe666fc..aa65148d5f7cc 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -487,9 +487,12 @@ cdef class BitmaskArray: # TODO: upstream generic ArrowBitsGet function in nanoarrow PySlice_Unpack(key, &start, &stop, &step) if start == 0 and stop > 0 and step == 1: + nbytes = (stop + 7) // 8 + if nbytes > self_.bitmap.size_bits: + nbytes = self_.bitmap.size_bits + bma = BitmaskArray.__new__(BitmaskArray) ArrowBitmapInit(&bitmap) - nbytes = (stop + 7) // 8 ArrowBitmapReserve(&bitmap, nbytes) memcpy(bitmap.buffer.data, self_.bitmap.buffer.data, nbytes) bitmap.buffer.size_bytes = nbytes diff --git a/pandas/tests/arrays/masked/test_bitmask.py b/pandas/tests/arrays/masked/test_bitmask.py index 351cae23384ae..ddbb9d87a9a2e 100644 --- a/pandas/tests/arrays/masked/test_bitmask.py +++ b/pandas/tests/arrays/masked/test_bitmask.py @@ -121,15 +121,23 @@ def test_getitem_null_slice(): assert (result.bytes[0] >> 2) & 1 == 1 -def test_getitem_monotonic_slice(): +@pytest.mark.parametrize( + "indexer,mask,expected", + [ + pytest.param(slice(2), bytes([0x3]), bytes([0x1]), id="basic_slice"), + pytest.param( + slice(1000), bytes([0x7]), bytes([0x05]), id="slice_exceeding_bounds" + ), + ], +) +def test_getitem_monotonic_slice(indexer, mask, expected): bma = BitmaskArray(np.array([True, False, True])) - result = bma[slice(2)] + result = bma[indexer] assert not result.parent - assert len(result) == 2 - assert result.bytes[0] & 1 == 1 - assert (result.bytes[0] >> 1) & 1 == 0 + # the bits past the length of result are 
undefined, so explicitly mask them out + assert (result.bytes[0] & mask[0]) == expected[0] @pytest.mark.parametrize( From 8149e03400438266f2266b235d578f095e8fb169 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 28 Aug 2023 14:33:06 -0400 Subject: [PATCH 107/126] fix slicing issue with memview --- pandas/_libs/arrays.pyx | 9 +++++---- pandas/tests/arrays/masked/test_bitmask.py | 7 +++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index aa65148d5f7cc..15ebecf55be6f 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -481,15 +481,16 @@ cdef class BitmaskArray: return bool(ArrowBitGet(self.bitmap.buffer.data, ckey)) elif is_null_slice(key): return self.copy() - elif isinstance(key, slice): + elif isinstance(key, slice) and self.ndim == 1: # fastpath for slices that start at 0 and step 1 at a time # towards a positive number. # TODO: upstream generic ArrowBitsGet function in nanoarrow PySlice_Unpack(key, &start, &stop, &step) if start == 0 and stop > 0 and step == 1: + if stop > self_.bitmap.size_bits: + stop = self_.bitmap.size_bits + nbytes = (stop + 7) // 8 - if nbytes > self_.bitmap.size_bits: - nbytes = self_.bitmap.size_bits bma = BitmaskArray.__new__(BitmaskArray) ArrowBitmapInit(&bitmap) @@ -501,7 +502,7 @@ cdef class BitmaskArray: bma.bitmap = bitmap bma.buffer_owner = True bma.ndim = self_.ndim - bma.shape = self_.shape + bma.shape[0] = stop bma.strides = self_.strides bma.parent = False diff --git a/pandas/tests/arrays/masked/test_bitmask.py b/pandas/tests/arrays/masked/test_bitmask.py index ddbb9d87a9a2e..95d8f1d233e1d 100644 --- a/pandas/tests/arrays/masked/test_bitmask.py +++ b/pandas/tests/arrays/masked/test_bitmask.py @@ -460,3 +460,10 @@ def test_memoryview(array, expected): bma = BitmaskArray(array) vw = memoryview(bma) assert vw.tolist() == expected + + +def test_bitmask_array_shape_from_sliced_bitmask(): + orig_bma = BitmaskArray([True] * 100) + bma = 
BitmaskArray(orig_bma[:10]) + + assert bma.shape == (10,) From 202de07da3a34c146523165834192ff252f89872 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 28 Aug 2023 15:03:21 -0400 Subject: [PATCH 108/126] fixups --- pandas/_libs/arrays.pyx | 10 ++++++---- pandas/tests/arrays/masked/test_bitmask.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 15ebecf55be6f..dcf0dceaf13f1 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -472,7 +472,7 @@ cdef class BitmaskArray: cdef Py_ssize_t start, stop, step cdef BitmaskArray bma cdef ArrowBitmap bitmap - cdef int64_t nbytes + cdef int64_t nbytes, nbits cdef BitmaskArray self_ = self # to_numpy can be expensive, so try to avoid for simple cases if isinstance(key, int) and self.ndim == 1: @@ -488,13 +488,15 @@ cdef class BitmaskArray: PySlice_Unpack(key, &start, &stop, &step) if start == 0 and stop > 0 and step == 1: if stop > self_.bitmap.size_bits: - stop = self_.bitmap.size_bits + nbits = self_.bitmap.size_bits + else: + nbits = stop nbytes = (stop + 7) // 8 bma = BitmaskArray.__new__(BitmaskArray) ArrowBitmapInit(&bitmap) - ArrowBitmapReserve(&bitmap, nbytes) + ArrowBitmapReserve(&bitmap, nbits) memcpy(bitmap.buffer.data, self_.bitmap.buffer.data, nbytes) bitmap.buffer.size_bytes = nbytes bitmap.size_bits = stop @@ -502,7 +504,7 @@ cdef class BitmaskArray: bma.bitmap = bitmap bma.buffer_owner = True bma.ndim = self_.ndim - bma.shape[0] = stop + bma.shape[0] = nbits bma.strides = self_.strides bma.parent = False diff --git a/pandas/tests/arrays/masked/test_bitmask.py b/pandas/tests/arrays/masked/test_bitmask.py index 95d8f1d233e1d..66e31eca67d53 100644 --- a/pandas/tests/arrays/masked/test_bitmask.py +++ b/pandas/tests/arrays/masked/test_bitmask.py @@ -463,7 +463,7 @@ def test_memoryview(array, expected): def test_bitmask_array_shape_from_sliced_bitmask(): - orig_bma = BitmaskArray([True] * 100) + orig_bma = 
BitmaskArray(np.array([True] * 100)) bma = BitmaskArray(orig_bma[:10]) assert bma.shape == (10,) From 73f438c7094bed3312e4f29fb3d8ff19b79867e7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 28 Aug 2023 17:11:34 -0400 Subject: [PATCH 109/126] fixed memory issues with getitem fastpath --- pandas/_libs/arrays.pyx | 46 ++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index dcf0dceaf13f1..10f73cd7a3fa0 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -397,6 +397,7 @@ cdef class BitmaskArray: ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, total_bits) + ConcatenateBitmapData(bitmaps, nbitmaps, &bitmap) free(bitmaps) @@ -492,14 +493,14 @@ cdef class BitmaskArray: else: nbits = stop - nbytes = (stop + 7) // 8 + nbytes = (nbits + 7) // 8 bma = BitmaskArray.__new__(BitmaskArray) ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, nbits) memcpy(bitmap.buffer.data, self_.bitmap.buffer.data, nbytes) bitmap.buffer.size_bytes = nbytes - bitmap.size_bits = stop + bitmap.size_bits = nbits bma.bitmap = bitmap bma.buffer_owner = True @@ -557,10 +558,11 @@ cdef class BitmaskArray: BitmapAnd(&self_.bitmap, &other_bma.bitmap, &bitmap) result = np.empty(self_.bitmap.size_bits, dtype=bool) - BitmaskArray.buffer_to_array_1d( - result, + ArrowBitsUnpackInt8( bitmap.buffer.data, - bitmap.size_bits + 0, + bitmap.size_bits, + &result[0] ) ArrowBitmapReset(&bitmap) @@ -592,10 +594,11 @@ cdef class BitmaskArray: BitmapOr(&self_.bitmap, &other_bma.bitmap, &bitmap) result = np.empty(self_.bitmap.size_bits, dtype=bool) - BitmaskArray.buffer_to_array_1d( - result, + ArrowBitsUnpackInt8( bitmap.buffer.data, - bitmap.size_bits + 0, + bitmap.size_bits, + &result[0] ) ArrowBitmapReset(&bitmap) @@ -627,10 +630,11 @@ cdef class BitmaskArray: BitmapXor(&self_.bitmap, &other_bma.bitmap, &bitmap) result = np.empty(self_.bitmap.size_bits, dtype=bool) - 
BitmaskArray.buffer_to_array_1d( - result, + ArrowBitsUnpackInt8( bitmap.buffer.data, - bitmap.size_bits + 0, + bitmap.size_bits, + &result[0] ) ArrowBitmapReset(&bitmap) if self_.ndim == 2: @@ -737,7 +741,7 @@ cdef class BitmaskArray: self_.bitmap.buffer.data, 0, self_.bitmap.size_bits, - self_.memview_buf + self_.memview_buf ) buffer.buf = self_.memview_buf @@ -840,19 +844,15 @@ cdef class BitmaskArray: def copy(self): return BitmaskArray.copy_from_bitmaskarray(self) - @cython.boundscheck(False) # TODO: Removing this causes an IndexError? Zero size? - @cython.wraparound(False) - @staticmethod - cdef void buffer_to_array_1d(uint8_t[:] out, const uint8_t* buf, Py_ssize_t size): - ArrowBitsUnpackInt8(buf, 0, size, &out[0]) - def to_numpy(self) -> ndarray: cdef BitmaskArray self_ = self - cdef ndarray[uint8_t] result = np.empty(self.bitmap.size_bits, dtype=bool) - BitmaskArray.buffer_to_array_1d( - result, - self.bitmap.buffer.data, - self.bitmap.size_bits + cdef ndarray[uint8_t] result = np.empty(self_.bitmap.size_bits, dtype=bool) + + ArrowBitsUnpackInt8( + self_.bitmap.buffer.data, + 0, + self_.bitmap.size_bits, + cnp.PyArray_BYTES(result), ) if self_.ndim == 2: From e09743f1fc00d7865aecafcf996a234ad649acea Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 28 Aug 2023 18:14:09 -0400 Subject: [PATCH 110/126] fix copy --- pandas/_libs/arrays.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 10f73cd7a3fa0..ddc708af2411f 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -481,7 +481,7 @@ cdef class BitmaskArray: if ckey >= 0 and ckey < self.bitmap.size_bits: return bool(ArrowBitGet(self.bitmap.buffer.data, ckey)) elif is_null_slice(key): - return self.copy() + return self elif isinstance(key, slice) and self.ndim == 1: # fastpath for slices that start at 0 and step 1 at a time # towards a positive number. 
From 3303be760bcb31dce9f92355fc705a4c77f4bbb4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 29 Aug 2023 14:07:53 -0400 Subject: [PATCH 111/126] win/32bit support --- pandas/tests/arrays/masked/test_bitmask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/masked/test_bitmask.py b/pandas/tests/arrays/masked/test_bitmask.py index 66e31eca67d53..704c22b73e182 100644 --- a/pandas/tests/arrays/masked/test_bitmask.py +++ b/pandas/tests/arrays/masked/test_bitmask.py @@ -401,7 +401,7 @@ def test_take1d(): def test_take1d_raises_not_axis0(): bma = BitmaskArray(np.array([True, False, True])) with pytest.raises(NotImplementedError, match="only implemented for axis=0"): - bma.take_1d(np.array([1]), axis=1) + bma.take_1d(np.array([1], dtype=np.int64), axis=1) def test_take_1d_raises_empty_indices(): From 29873e442cc11d1a37a652036ef452a86c52df97 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 29 Aug 2023 17:25:03 -0400 Subject: [PATCH 112/126] NumPy compat --- pandas/tests/frame/indexing/test_where.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 3d3df2d714ca4..3cc0f48613f08 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gte1p24 + from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -702,6 +704,11 @@ def test_where_categorical_filtering(self): tm.assert_equal(result, expected) + @pytest.mark.xfail( + not np_version_gte1p24, + reason="Changed NumPy behavior for >1D non-tuple sequence indexing", + strict=False, + ) def test_where_ea_other(self): # GH#38729/GH#38742 df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) From 173b4cbe53ec787ee31408d5132623e93a1527a0 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 29 Aug 2023 18:42:14 -0400 Subject: [PATCH 113/126] test 
restructure --- pandas/tests/arrays/masked/test_bitmask.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/tests/arrays/masked/test_bitmask.py b/pandas/tests/arrays/masked/test_bitmask.py index 704c22b73e182..9141587d27171 100644 --- a/pandas/tests/arrays/masked/test_bitmask.py +++ b/pandas/tests/arrays/masked/test_bitmask.py @@ -10,7 +10,7 @@ @pytest.mark.parametrize( - "array,expected", + "data,expected", [ pytest.param(np.array([False, False]), bytes([0x0]), id="all_false"), pytest.param(np.array([True, False]), bytes([0x1]), id="first_true"), @@ -24,23 +24,24 @@ ), ], ) -def test_constructor_ndarray(array, expected): - bma = BitmaskArray(array) +def test_constructor_ndarray(data, expected): + bma = BitmaskArray(data) assert bma.bytes == expected assert not bma.parent @pytest.mark.parametrize( - "parent,expected", + "data,expected", [ - (BitmaskArray(np.array([False, False])), bytes([0x0])), - (BitmaskArray(np.array([True, False])), bytes([0x1])), - (BitmaskArray(np.array([False, True])), bytes([0x2])), - (BitmaskArray(np.array([True, True])), bytes([0x3])), - (BitmaskArray(np.array([True, False] * 8)), bytes([0x55, 0x55])), + (np.array([False, False]), bytes([0x0])), + (np.array([True, False]), bytes([0x1])), + (np.array([False, True]), bytes([0x2])), + (np.array([True, True]), bytes([0x3])), + (np.array([True, False] * 8), bytes([0x55, 0x55])), ], ) -def test_constructor_bitmap(parent, expected): +def test_constructor_bitmap(data, expected): + parent = BitmaskArray(data) bma = BitmaskArray(parent) assert bma.bytes == expected assert bma.parent is parent From a1278a94cada4aca1dab6fdc165532ce515d5c5a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 30 Aug 2023 12:22:06 -0400 Subject: [PATCH 114/126] more performance --- pandas/_libs/arrays.pyx | 181 ++++++++++-------- .../_libs/include/pandas/bitmask_algorithms.h | 12 ++ pandas/_libs/src/bitmask_algorithms.c | 178 +++++++++++++++-- 
pandas/tests/arrays/masked/test_bitmask.py | 161 +++++++++++++--- 4 files changed, 406 insertions(+), 126 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index ddc708af2411f..ad421e06b1864 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -53,6 +53,9 @@ cdef extern from "pandas/bitmask_algorithms.h": bint BitmapOr(const ArrowBitmap*, const ArrowBitmap*, ArrowBitmap*) bint BitmapXor(const ArrowBitmap*, const ArrowBitmap*, ArrowBitmap*) bint BitmapAnd(const ArrowBitmap*, const ArrowBitmap*, ArrowBitmap*) + bint BitmapOrBool(const ArrowBitmap*, bint, ArrowBitmap*) + bint BitmapXorBool(const ArrowBitmap*, bint, ArrowBitmap*) + bint BitmapAndBool(const ArrowBitmap*, bint, ArrowBitmap*) bint BitmapInvert(const ArrowBitmap*, ArrowBitmap*) bint BitmapTake(const ArrowBitmap*, const int64_t*, size_t, ArrowBitmap*) bint BitmapPutFromBufferMask(ArrowBitmap*, const uint8_t*, size_t, uint8_t) @@ -253,7 +256,7 @@ cdef class BitmaskArray: @cython.boundscheck(False) @cython.wraparound(False) - cdef void init_from_ndarray(self, const uint8_t[::1] arr): + cdef void init_from_ndarray(self, const uint8_t[::1] arr) noexcept: cdef ArrowBitmap bitmap # As long as we have a 1D arr argument we can use .shape[0] to avoid # a call to Python via .size @@ -264,7 +267,7 @@ cdef class BitmaskArray: self.bitmap = bitmap self.buffer_owner = True - cdef void init_from_bitmaskarray(self, BitmaskArray bma): + cdef void init_from_bitmaskarray(self, BitmaskArray bma) noexcept: self.bitmap = bma.bitmap self.buffer_owner = False self.ndim = bma.ndim @@ -274,27 +277,22 @@ cdef class BitmaskArray: self.shape[1] = bma.shape[1] self.strides[1] = bma.strides[1] - def __cinit__(self): - cdef BitmaskArray self_ = self - self.parent = False - self_.n_consumers = 0 - self_.memview_buf = NULL - def __init__(self, data): - cdef BitmaskArray self_ = self - if isinstance(data, np.ndarray): - if not data.flags["C_CONTIGUOUS"]: - data = np.ascontiguousarray(data) - - 
self.init_from_ndarray(data.ravel()) + cdef ndarray arr + if cnp.PyArray_Check(data): + arr = data + if not cnp.PyArray_IS_C_CONTIGUOUS(arr): + arr = cnp.PyArray_GETCONTIGUOUS(arr) + + self.init_from_ndarray(arr.ravel()) + self.ndim = arr.ndim + self.shape[0] = arr.shape[0] + self.strides[0] = arr.strides[0] + if self.ndim == 2: + self.shape[1] = arr.shape[1] + self.strides[1] = arr.strides[1] self.parent = None - self_.ndim = data.ndim - self_.shape[0] = data.shape[0] - self_.strides[0] = data.strides[0] - if (data.ndim == 2): - self_.shape[1] = data.shape[1] - self_.strides[1] = data.strides[1] - elif isinstance(data, type(self)): + elif isinstance(data, BitmaskArray): self.init_from_bitmaskarray(data) self.parent = data else: @@ -475,11 +473,13 @@ cdef class BitmaskArray: cdef ArrowBitmap bitmap cdef int64_t nbytes, nbits cdef BitmaskArray self_ = self + cdef bint result # to_numpy can be expensive, so try to avoid for simple cases if isinstance(key, int) and self.ndim == 1: ckey = key if ckey >= 0 and ckey < self.bitmap.size_bits: - return bool(ArrowBitGet(self.bitmap.buffer.data, ckey)) + result = ArrowBitGet(self.bitmap.buffer.data, ckey) + return result elif is_null_slice(key): return self elif isinstance(key, slice) and self.ndim == 1: @@ -537,19 +537,14 @@ cdef class BitmaskArray: return bma def __and__(self, other): - cdef ndarray[uint8_t] result cdef BitmaskArray other_bma, self_ = self # self_ required for Cython < 3 + cdef BitmaskArray bma cdef ArrowBitmap bitmap + cdef bint bval - if isinstance(other, type(self)): + if isinstance(other, BitmaskArray): # TODO: maybe should return Self here instead of ndarray - other_bma = other - if self_.bitmap.size_bits == 0: - result = np.empty([], dtype=bool) - if self_.ndim == 2: - return result.reshape(self_.shape[0], self_.shape[1]) - return result - + other_bma = other if self_.bitmap.size_bits != other_bma.bitmap.size_bits: raise ValueError("bitmaps are not equal size") @@ -557,29 +552,40 @@ cdef class 
BitmaskArray: ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) BitmapAnd(&self_.bitmap, &other_bma.bitmap, &bitmap) - result = np.empty(self_.bitmap.size_bits, dtype=bool) - ArrowBitsUnpackInt8( - bitmap.buffer.data, - 0, - bitmap.size_bits, - &result[0] - ) - ArrowBitmapReset(&bitmap) + bma = BitmaskArray.__new__(BitmaskArray) + bma.bitmap = bitmap + bma.buffer_owner = True + bma.ndim = self_.ndim + bma.shape = self_.shape + bma.strides = self_.strides + + return bma + elif isinstance(other, bool): + bval = other + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) + BitmapAndBool(&self_.bitmap, bval, &bitmap) + + bma = BitmaskArray.__new__(BitmaskArray) + bma.bitmap = bitmap + bma.buffer_owner = True + bma.ndim = self_.ndim + bma.shape = self_.shape + bma.strides = self_.strides - if self_.ndim == 2: - return result.reshape(self_.shape[0], self_.shape[1]) - return result + return bma return self.to_numpy() & other def __or__(self, other): cdef ndarray[uint8_t] result cdef BitmaskArray other_bma, self_ = self # self_ required for Cython < 3 + cdef BitmaskArray bma cdef ArrowBitmap bitmap + cdef bint bval - if isinstance(other, type(self)): - # TODO: maybe should return Self here instead of ndarray - other_bma = other + if isinstance(other, BitmaskArray): + other_bma = other if self_.bitmap.size_bits == 0: result = np.empty([], dtype=bool) if self_.ndim == 2: @@ -593,35 +599,40 @@ cdef class BitmaskArray: ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) BitmapOr(&self_.bitmap, &other_bma.bitmap, &bitmap) - result = np.empty(self_.bitmap.size_bits, dtype=bool) - ArrowBitsUnpackInt8( - bitmap.buffer.data, - 0, - bitmap.size_bits, - &result[0] - ) - ArrowBitmapReset(&bitmap) + bma = BitmaskArray.__new__(BitmaskArray) + bma.bitmap = bitmap + bma.buffer_owner = True + bma.ndim = self_.ndim + bma.shape = self_.shape + bma.strides = self_.strides + + return bma + elif isinstance(other, bool): + bval = other + ArrowBitmapInit(&bitmap) + 
ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) + BitmapOrBool(&self_.bitmap, bval, &bitmap) + + bma = BitmaskArray.__new__(BitmaskArray) + bma.bitmap = bitmap + bma.buffer_owner = True + bma.ndim = self_.ndim + bma.shape = self_.shape + bma.strides = self_.strides - if self_.ndim == 2: - return result.reshape(self_.shape[0], self_.shape[1]) - return result + return bma return self.to_numpy() | other def __xor__(self, other): - cdef ndarray[uint8_t] result cdef BitmaskArray other_bma, self_ = self # self_ required for Cython < 3 + cdef BitmaskArray bma cdef ArrowBitmap bitmap + cdef bint bval - if isinstance(other, type(self)): + if isinstance(other, BitmaskArray): # TODO: maybe should return Self here instead of ndarray - other_bma = other - if self_.bitmap.size_bits == 0: - result = np.empty([], dtype=bool) - if self_.ndim == 2: - return result.reshape(self_.shape[0], self_.shape[1]) - return result - + other_bma = other if self_.bitmap.size_bits != other_bma.bitmap.size_bits: raise ValueError("bitmaps are not equal size") @@ -629,17 +640,28 @@ cdef class BitmaskArray: ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) BitmapXor(&self_.bitmap, &other_bma.bitmap, &bitmap) - result = np.empty(self_.bitmap.size_bits, dtype=bool) - ArrowBitsUnpackInt8( - bitmap.buffer.data, - 0, - bitmap.size_bits, - &result[0] - ) - ArrowBitmapReset(&bitmap) - if self_.ndim == 2: - return result.reshape(self_.shape[0], self_.shape[1]) - return result + bma = BitmaskArray.__new__(BitmaskArray) + bma.bitmap = bitmap + bma.buffer_owner = True + bma.ndim = self_.ndim + bma.shape = self_.shape + bma.strides = self_.strides + + return bma + elif isinstance(other, bool): + bval = other + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) + BitmapXorBool(&self_.bitmap, bval, &bitmap) + + bma = BitmaskArray.__new__(BitmaskArray) + bma.bitmap = bitmap + bma.buffer_owner = True + bma.ndim = self_.ndim + bma.shape = self_.shape + bma.strides = self_.strides + + 
return bma return self.to_numpy() ^ other @@ -729,8 +751,10 @@ cdef class BitmaskArray: def __iter__(self): cdef Py_ssize_t i cdef BitmaskArray self_ = self # self_ required for Cython < 3 + cdef bint result for i in range(self_.bitmap.size_bits): - yield bool(ArrowBitGet(self_.bitmap.buffer.data, i)) + result = ArrowBitGet(self_.bitmap.buffer.data, i) + yield result def __getbuffer__(self, Py_buffer *buffer, int flags): cdef BitmaskArray self_ = self @@ -795,7 +819,8 @@ cdef class BitmaskArray: return np.dtype("bool") def any(self) -> bool: - return BitmapAny(&self.bitmap) + cdef bint result = BitmapAny(&self.bitmap) + return result def all(self) -> bool: return BitmapAll(&self.bitmap) diff --git a/pandas/_libs/include/pandas/bitmask_algorithms.h b/pandas/_libs/include/pandas/bitmask_algorithms.h index f29fddf6551ed..8598b5ebdd568 100644 --- a/pandas/_libs/include/pandas/bitmask_algorithms.h +++ b/pandas/_libs/include/pandas/bitmask_algorithms.h @@ -23,14 +23,26 @@ bool BitmapAll(const struct ArrowBitmap *bitmap); int BitmapOr(const struct ArrowBitmap *bitmap1, const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out); +/* Returns -1 on failure. On success returns 0 and writes to out */ +int BitmapOrBool(const struct ArrowBitmap *bitmap1, bool, + struct ArrowBitmap *out); + /* Returns -1 on failure. On success returns 0 and writes to out */ int BitmapXor(const struct ArrowBitmap *bitmap1, const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out); +/* Returns -1 on failure. On success returns 0 and writes to out */ +int BitmapXorBool(const struct ArrowBitmap *bitmap1, bool, + struct ArrowBitmap *out); + /* Returns -1 on failure. On success returns 0 and writes to out */ int BitmapAnd(const struct ArrowBitmap *bitmap1, const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out); +/* Returns -1 on failure. On success returns 0 and writes to out */ +int BitmapAndBool(const struct ArrowBitmap *bitmap1, bool, + struct ArrowBitmap *out); + /* Returns -1 on failure. 
On success returns 0 and writes to out */ int BitmapInvert(const struct ArrowBitmap *bitmap, struct ArrowBitmap *out); diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index d78d8b973a789..14d27b206838b 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -69,23 +69,22 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, } bool BitmapAny(const struct ArrowBitmap *bitmap) { - const size_t nbits = bitmap->size_bits; - const size_t size_bytes = bitmap->buffer.size_bytes; - const uint8_t *buf = bitmap->buffer.data; - - if (nbits < 1) { + if (bitmap->size_bits < 1) { return false; } - for (size_t i = 0; i < size_bytes - 1; i++) { - if (buf[i] > 0) { + size_t i = 0; + for (; i + sizeof(size_t) - 1 < bitmap->buffer.size_bytes; + i += sizeof(size_t)) { + size_t value; + memcpy(&value, &bitmap->buffer.data[i], sizeof(size_t)); + if (value != 0x0) { return true; } } - const size_t bits_remaining = nbits - ((size_bytes - 1) * 8); - for (size_t i = 0; i < bits_remaining; i++) { - if (ArrowBitGet(buf, nbits - i - 1)) { + for (; i < bitmap->buffer.size_bytes; i++) { + if (bitmap->buffer.data[i] != 0x0) { return true; } } @@ -96,21 +95,29 @@ bool BitmapAny(const struct ArrowBitmap *bitmap) { bool BitmapAll(const struct ArrowBitmap *bitmap) { const size_t nbits = bitmap->size_bits; const size_t size_bytes = bitmap->buffer.size_bytes; - const uint8_t *buf = bitmap->buffer.data; - if (nbits < 1) { return true; } - for (size_t i = 0; i < size_bytes - 1; i++) { - if (buf[i] != 0xff) { + size_t i = 0; + for (; i + sizeof(size_t) - 1 < bitmap->buffer.size_bytes; + i += sizeof(size_t)) { + size_t value; + memcpy(&value, &bitmap->buffer.data[i], sizeof(size_t)); + if (value != SIZE_MAX) { + return false; + } + } + + for (; i < bitmap->buffer.size_bytes - 1; i++) { + if (bitmap->buffer.data[i] != 0xff) { return false; } } const size_t bits_remaining = nbits - ((size_bytes - 
1) * 8); for (size_t i = 0; i < bits_remaining; i++) { - if (ArrowBitGet(buf, nbits - i - 1) == 0) { + if (ArrowBitGet(bitmap->buffer.data, nbits - i - 1) == 0) { return false; } } @@ -126,7 +133,19 @@ int BitmapOr(const struct ArrowBitmap *bitmap1, return -1; } - for (int64_t i = 0; i < bitmap1->buffer.size_bytes; i++) { + size_t i = 0; + for (; i + sizeof(size_t) - 1 < bitmap1->buffer.size_bytes; + i += sizeof(size_t)) { + size_t value1; + size_t value2; + size_t result; + memcpy(&value1, &bitmap1->buffer.data[i], sizeof(size_t)); + memcpy(&value2, &bitmap2->buffer.data[i], sizeof(size_t)); + result = value1 | value2; + memcpy(&out->buffer.data[i], &result, sizeof(size_t)); + } + + for (; i < bitmap1->buffer.size_bytes; i++) { out->buffer.data[i] = bitmap1->buffer.data[i] | bitmap2->buffer.data[i]; } @@ -136,6 +155,35 @@ int BitmapOr(const struct ArrowBitmap *bitmap1, return 0; } +int BitmapOrBool(const struct ArrowBitmap *bitmap1, bool other, + struct ArrowBitmap *out) { + if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { + return -1; + } + + const size_t mask = other ? SIZE_MAX : 0; + const uint8_t umask = other ? 
UINT8_MAX : 0; + + size_t i = 0; + for (; i + sizeof(size_t) - 1 < bitmap1->buffer.size_bytes; + i += sizeof(size_t)) { + size_t value1; + size_t result; + memcpy(&value1, &bitmap1->buffer.data[i], sizeof(size_t)); + result = value1 | mask; + memcpy(&out->buffer.data[i], &result, sizeof(size_t)); + } + + for (; i < bitmap1->buffer.size_bytes; i++) { + out->buffer.data[i] = bitmap1->buffer.data[i] | umask; + } + + out->size_bits = bitmap1->size_bits; + out->buffer.size_bytes = bitmap1->buffer.size_bytes; + + return 0; +} + int BitmapAnd(const struct ArrowBitmap *bitmap1, const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out) { if (bitmap1->size_bits != bitmap2->size_bits) { @@ -144,7 +192,19 @@ int BitmapAnd(const struct ArrowBitmap *bitmap1, return -1; } - for (int64_t i = 0; i < bitmap1->buffer.size_bytes; i++) { + size_t i = 0; + for (; i + sizeof(size_t) - 1 < bitmap1->buffer.size_bytes; + i += sizeof(size_t)) { + size_t value1; + size_t value2; + size_t result; + memcpy(&value1, &bitmap1->buffer.data[i], sizeof(size_t)); + memcpy(&value2, &bitmap2->buffer.data[i], sizeof(size_t)); + result = value1 & value2; + memcpy(&out->buffer.data[i], &result, sizeof(size_t)); + } + + for (; i < bitmap1->buffer.size_bytes; i++) { out->buffer.data[i] = bitmap1->buffer.data[i] & bitmap2->buffer.data[i]; } @@ -154,6 +214,35 @@ int BitmapAnd(const struct ArrowBitmap *bitmap1, return 0; } +int BitmapAndBool(const struct ArrowBitmap *bitmap1, bool other, + struct ArrowBitmap *out) { + if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { + return -1; + } + + const size_t mask = other ? SIZE_MAX : 0; + const uint8_t umask = other ? 
UINT8_MAX : 0; + + size_t i = 0; + for (; i + sizeof(size_t) - 1 < bitmap1->buffer.size_bytes; + i += sizeof(size_t)) { + size_t value1; + size_t result; + memcpy(&value1, &bitmap1->buffer.data[i], sizeof(size_t)); + result = value1 & mask; + memcpy(&out->buffer.data[i], &result, sizeof(size_t)); + } + + for (; i < bitmap1->buffer.size_bytes; i++) { + out->buffer.data[i] = bitmap1->buffer.data[i] & umask; + } + + out->size_bits = bitmap1->size_bits; + out->buffer.size_bytes = bitmap1->buffer.size_bytes; + + return 0; +} + int BitmapXor(const struct ArrowBitmap *bitmap1, const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out) { if (bitmap1->size_bits != bitmap2->size_bits) { @@ -162,7 +251,19 @@ int BitmapXor(const struct ArrowBitmap *bitmap1, return -1; } - for (int64_t i = 0; i < bitmap1->buffer.size_bytes; i++) { + size_t i = 0; + for (; i + sizeof(size_t) - 1 < bitmap1->buffer.size_bytes; + i += sizeof(size_t)) { + size_t value1; + size_t value2; + size_t result; + memcpy(&value1, &bitmap1->buffer.data[i], sizeof(size_t)); + memcpy(&value2, &bitmap2->buffer.data[i], sizeof(size_t)); + result = value1 ^ value2; + memcpy(&out->buffer.data[i], &result, sizeof(size_t)); + } + + for (; i < bitmap1->buffer.size_bytes; i++) { out->buffer.data[i] = bitmap1->buffer.data[i] ^ bitmap2->buffer.data[i]; } @@ -172,12 +273,51 @@ int BitmapXor(const struct ArrowBitmap *bitmap1, return 0; } +int BitmapXorBool(const struct ArrowBitmap *bitmap1, bool other, + struct ArrowBitmap *out) { + if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { + return -1; + } + + const size_t mask = other ? SIZE_MAX : 0; + const uint8_t umask = other ? 
UINT8_MAX : 0; + + size_t i = 0; + for (; i + sizeof(size_t) - 1 < bitmap1->buffer.size_bytes; + i += sizeof(size_t)) { + size_t value1; + size_t result; + memcpy(&value1, &bitmap1->buffer.data[i], sizeof(size_t)); + result = value1 ^ mask; + memcpy(&out->buffer.data[i], &result, sizeof(size_t)); + } + + for (; i < bitmap1->buffer.size_bytes; i++) { + out->buffer.data[i] = bitmap1->buffer.data[i] ^ umask; + } + + out->size_bits = bitmap1->size_bits; + out->buffer.size_bytes = bitmap1->buffer.size_bytes; + + return 0; +} + int BitmapInvert(const struct ArrowBitmap *bitmap, struct ArrowBitmap *out) { if (!(out->buffer.capacity_bytes >= bitmap->buffer.size_bytes)) { return -1; } - for (int64_t i = 0; i < bitmap->buffer.size_bytes; i++) { + size_t i = 0; + for (; i + sizeof(size_t) - 1 < bitmap->buffer.size_bytes; + i += sizeof(size_t)) { + size_t value; + size_t result; + memcpy(&value, &bitmap->buffer.data[i], sizeof(size_t)); + result = ~value; + memcpy(&out->buffer.data[i], &result, sizeof(size_t)); + } + + for (; i < bitmap->buffer.size_bytes; i++) { out->buffer.data[i] = ~bitmap->buffer.data[i]; } diff --git a/pandas/tests/arrays/masked/test_bitmask.py b/pandas/tests/arrays/masked/test_bitmask.py index 9141587d27171..0144b8df55f38 100644 --- a/pandas/tests/arrays/masked/test_bitmask.py +++ b/pandas/tests/arrays/masked/test_bitmask.py @@ -192,7 +192,22 @@ def test_invert(): assert ((result2.bytes[0] >> 1) & 0x1) == 0 -@pytest.mark.parametrize("rhs_as_bitmask", [True, False]) +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], [True], bytes([0x1])), + ([True], [False], bytes([0x0])), + ([False], [False], bytes([0x0])), + ([True] * 10, [True] * 10, bytes([0xFF, 0x3])), + ([False] * 10, [True] * 10, bytes([0x0, 0x0])), + ], +) +def test_and_bitmask(lhs, rhs, expected): + bma1 = BitmaskArray(np.array(lhs)) + result = bma1 & BitmaskArray(np.array(rhs)) + assert result.bytes == expected + + @pytest.mark.parametrize( "lhs,rhs,expected", [ @@ -203,44 +218,115 
@@ def test_invert(): ([False] * 10, [True] * 10, [False] * 10), ], ) -def test_and(rhs_as_bitmask, lhs, rhs, expected): +def test_and_ndarray(lhs, rhs, expected): bma1 = BitmaskArray(np.array(lhs)) - if rhs_as_bitmask: - bma2 = BitmaskArray(np.array(rhs)) - else: - bma2 = np.array(rhs) + result = bma1 & np.array(rhs) + assert (result == np.array(expected)).all() - expected = np.array(expected) - result = bma1 & bma2 - assert (result == expected).all() + +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], True, bytes([0x1])), + ([True], False, bytes([0x0])), + ([False], False, bytes([0x0])), + ([True] * 10, True, bytes([0xFF, 0x3])), + ([False] * 10, True, bytes([0x0, 0x0])), + ], +) +def test_and_scalar(lhs, rhs, expected): + bma1 = BitmaskArray(np.array(lhs)) + result = bma1 & rhs + + # We don't really care about the bits that + # exist beyond the length of the bitmask, but + # to make testing easy we assume XOR still operates + # on them. Might be better to implement equality + # on bitmaskarray and test instead of looking at bytes + assert result.bytes == expected + + +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], [True], bytes([0x1])), + ([True], [False], bytes([0x1])), + ([False], [False], bytes([0x0])), + ([True] * 10, [True] * 10, bytes([0xFF, 0x3])), + ([False] * 10, [True] * 10, bytes([0xFF, 0x3])), + ], +) +def test_or_bitmask(lhs, rhs, expected): + bma1 = BitmaskArray(np.array(lhs)) + result = bma1 | BitmaskArray(np.array(rhs)) + assert result.bytes == expected -@pytest.mark.parametrize("rhs_as_bitmask", [True, False]) @pytest.mark.parametrize( "lhs,rhs,expected", [ ([True], [True], [True]), - ([True], [False], [True]), - ([False], [False], [False]), + ( + [True], + [False], + [True], + ), + ( + [False], + [False], + [False], + ), ([True] * 10, [True] * 10, [True] * 10), ([False] * 10, [True] * 10, [True] * 10), ], ) -def test_or(rhs_as_bitmask, lhs, rhs, expected): +def test_or_ndarray(lhs, rhs, expected): bma1 = 
BitmaskArray(np.array(lhs)) - if rhs_as_bitmask: - bma2 = BitmaskArray(np.array(rhs)) - else: - bma2 = np.array(rhs) + result = bma1 | np.array(rhs) + assert (result == np.array(expected)).all() - expected = np.array(expected) - result = bma1 | bma2 - assert (result == expected).all() + +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], True, bytes([0xFF])), + ([True], False, bytes([0x1])), + ([False], False, bytes([0x0])), + ([True] * 10, True, bytes([0xFF, 0xFF])), + ([False] * 10, True, bytes([0xFF, 0xFF])), + ], +) +def test_or_scalar(lhs, rhs, expected): + bma1 = BitmaskArray(np.array(lhs)) + result = bma1 | rhs + + # We don't really care about the bits that + # exist beyond the length of the bitmask, but + # to make testing easy we assume XOR still operates + # on them. Might be better to implement equality + # on bitmaskarray and test instead of looking at bytes + assert result.bytes == expected + + +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], [True], bytes([0x0])), + ([True], [False], bytes([0x1])), + ([False], [False], bytes([0x0])), + ([True] * 10, [True] * 10, bytes([0x0, 0x0])), + ([False] * 10, [True] * 10, bytes([0xFF, 0x3])), + ], +) +def test_xor_bitmask(lhs, rhs, expected): + bma1 = BitmaskArray(np.array(lhs)) + other = BitmaskArray(np.array(rhs)) + result = bma1 ^ other + assert result.bytes == expected -@pytest.mark.parametrize("rhs_as_bitmask", [True, False]) @pytest.mark.parametrize( "lhs,rhs,expected", [ @@ -251,17 +337,34 @@ def test_or(rhs_as_bitmask, lhs, rhs, expected): ([False] * 10, [True] * 10, [True] * 10), ], ) -def test_xor(rhs_as_bitmask, lhs, rhs, expected): +def test_xor_ndarray(lhs, rhs, expected): bma1 = BitmaskArray(np.array(lhs)) + other = np.array(rhs) + result = bma1 ^ other + assert (result == np.array(expected)).all() - if rhs_as_bitmask: - bma2 = BitmaskArray(np.array(rhs)) - else: - bma2 = np.array(rhs) - expected = np.array(expected) - result = bma1 ^ bma2 - assert (result == 
expected).all() +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], True, bytes([0xFE])), + ([True], False, bytes([0x1])), + ([False], False, bytes([0x0])), + ([True] * 10, True, bytes([0x0, 0xFC])), + ([False] * 10, True, bytes([0xFF, 0xFF])), + ], +) +def test_xor_scalar(lhs, rhs, expected): + bma1 = BitmaskArray(np.array(lhs)) + other = rhs + result = bma1 ^ other + + # We don't really care about the bits that + # exist beyond the length of the bitmask, but + # to make testing easy we assume XOR still operates + # on them. Might be better to implement equality + # on bitmaskarray and test instead of looking at bytes + assert result.bytes == expected def test_pickle(): From bc772c302a591ab200bc6982b1f80465eaff245e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 30 Aug 2023 16:45:26 -0400 Subject: [PATCH 115/126] bugfix with all refactor --- pandas/_libs/src/bitmask_algorithms.c | 13 +++++++++++-- pandas/tests/arrays/masked/test_bitmask.py | 18 ++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index 14d27b206838b..ccb682549d16a 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -69,7 +69,9 @@ void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, } bool BitmapAny(const struct ArrowBitmap *bitmap) { - if (bitmap->size_bits < 1) { + const size_t nbits = bitmap->size_bits; + const size_t size_bytes = bitmap->buffer.size_bytes; + if (nbits < 1) { return false; } @@ -83,12 +85,19 @@ bool BitmapAny(const struct ArrowBitmap *bitmap) { } } - for (; i < bitmap->buffer.size_bytes; i++) { + for (; i < bitmap->buffer.size_bytes - 1; i++) { if (bitmap->buffer.data[i] != 0x0) { return true; } } + const size_t bits_remaining = nbits - ((size_bytes - 1) * 8); + for (size_t i = 0; i < bits_remaining; i++) { + if (ArrowBitGet(bitmap->buffer.data, nbits - i - 1) == 1) { + return true; + } 
+ } + return false; } diff --git a/pandas/tests/arrays/masked/test_bitmask.py b/pandas/tests/arrays/masked/test_bitmask.py index 0144b8df55f38..d895618ba3483 100644 --- a/pandas/tests/arrays/masked/test_bitmask.py +++ b/pandas/tests/arrays/masked/test_bitmask.py @@ -450,6 +450,15 @@ def test_any(data, expected): assert bma.any() == expected +def test_any_sliced_bitmask(): + # Need to ensure any doesn't look beyond bounds of slice + bma = BitmaskArray(np.array([False, False, True, True])) + assert bma.any() + + new_bma = bma[:2] + assert not new_bma.any() + + @pytest.mark.parametrize( "data,expected", [ @@ -464,6 +473,15 @@ def test_all(data, expected): assert bma.all() == expected +def test_all_sliced_bitmask(): + # Need to ensure all doesn't look beyond bounds of slice + bma = BitmaskArray(np.array([True, True, False, False])) + assert not bma.all() + + new_bma = bma[:2] + assert new_bma.all() + + @pytest.mark.parametrize( "data,expected", [ From 1c637a16458af2d62cc3b7b4d198b14daa013c4c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 30 Aug 2023 17:20:45 -0400 Subject: [PATCH 116/126] less to_numpy() --- pandas/core/algorithms.py | 4 +--- pandas/core/arrays/masked.py | 15 ++++----------- pandas/core/arrays/string_.py | 2 +- pandas/core/arrays/string_arrow.py | 4 ++-- pandas/core/arrays/timedeltas.py | 2 +- 5 files changed, 9 insertions(+), 18 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 53a75754a49c2..9769e37829ec8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1005,9 +1005,7 @@ def duplicated( if isinstance(values.dtype, BaseMaskedDtype): values = cast("BaseMaskedArray", values) - return htable.duplicated( - values._data, keep=keep, mask=values._mask.to_numpy() - ) + return htable.duplicated(values._data, keep=keep, mask=values._mask) values = _ensure_data(values) return htable.duplicated(values, keep=keep) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 
da5f501e9b3f7..8883095b56b09 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -308,16 +308,12 @@ def __setitem__(self, key, value) -> None: value, mask = self._coerce_to_array(value, dtype=self.dtype) self._data[key] = value - if isinstance(mask, BitmaskArray): - mask = mask.to_numpy() - self._mask[key] = mask def __contains__(self, key) -> bool: if isna(key) and key is not self.dtype.na_value: # GH#52840 if self._data.dtype.kind == "f" and lib.is_float(key): - # TODO: implement low level invert operator on BitmaskArray return bool((np.isnan(self._data) & ~self._mask).any()) return bool(super().__contains__(key)) @@ -402,7 +398,7 @@ def round(self, decimals: int = 0, *args, **kwargs): values = np.round(self._data, decimals=decimals, **kwargs) # Usually we'll get same type as self, but ndarray[bool] casts to float - return self._maybe_mask_result(values, self._mask) + return self._maybe_mask_result(values, self._mask.copy()) # ------------------------------------------------------------------ # Unary Methods @@ -1378,14 +1374,12 @@ def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ nv.validate_any((), kwargs) - # attempt to avoid to_numpy call on mask for best performance is_all_na = self._mask.all() - is_any_na = self._mask.any() if len(self) == 0 or (skipna and is_all_na): return np.bool_(False) + is_any_na = self._mask.any() if is_any_na: - # fallback to numpy - will be slower values = self._data.copy() # error: Argument 3 to "putmask" has incompatible type "object"; # expected "Union[_SupportsArray[dtype[Any]], @@ -1474,12 +1468,11 @@ def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ nv.validate_all((), kwargs) - # attempt to avoid to_numpy call on mask for best performance is_all_na = self._mask.all() - is_any_na = self._mask.any() if len(self) == 0 or (skipna and is_all_na): return np.bool_(True) + is_any_na = self._mask.any() if is_any_na: values = self._data.copy() # 
error: Argument 3 to "putmask" has incompatible type "object"; @@ -1546,7 +1539,7 @@ def _groupby_op( min_count=min_count, ngroups=ngroups, comp_ids=ids, - mask=mask.to_numpy(), + mask=mask, result_mask=result_mask, **kwargs, ) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 967039c1498fc..72ba95e5fa258 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -372,7 +372,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype - na_values = scalars._mask.to_numpy() + na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) result[na_values] = libmissing.NA diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 607c549996eb2..f4f64e20a7977 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -155,10 +155,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and # numerical issues with Float32Dtype - na_values = scalars._mask.to_numpy() + na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - return cls(pa.array(result, mask=na_values, type=pa.string())) + return cls(pa.array(result, mask=na_values.to_numpy(), type=pa.string())) elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): return cls(pc.cast(scalars, pa.string())) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a8a6e028b11f9..b7b81b8271106 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -1071,7 +1071,7 @@ def sequence_to_td64ns( # cast the unit, multiply base/frac separately # to avoid precision issues from float -> 
int if isinstance(data.dtype, ExtensionDtype): - mask = data._mask.to_numpy() + mask = data._mask data = data._data else: mask = np.isnan(data) From 3dfe66823c8d9546b2d5bcece9608130e5b60c31 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 30 Aug 2023 17:37:58 -0400 Subject: [PATCH 117/126] Error message cleanups --- pandas/_libs/arrays.pyx | 32 +++++++++--- pandas/_libs/src/bitmask_algorithms.c | 71 ++++++++++++++++++--------- 2 files changed, 73 insertions(+), 30 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index ad421e06b1864..fd8382c13e6c3 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -521,11 +521,14 @@ cdef class BitmaskArray: cdef BitmaskArray self_ = self cdef BitmaskArray bma = BitmaskArray.__new__(BitmaskArray) cdef ArrowBitmap bitmap + cdef int ret ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) - BitmapInvert(&self_.bitmap, &bitmap) + ret = BitmapInvert(&self_.bitmap, &bitmap) + if ret == -1: + raise RuntimeError("BitmapInvert failed") bma.bitmap = bitmap bma.buffer_owner = True @@ -541,6 +544,7 @@ cdef class BitmaskArray: cdef BitmaskArray bma cdef ArrowBitmap bitmap cdef bint bval + cdef int ret if isinstance(other, BitmaskArray): # TODO: maybe should return Self here instead of ndarray @@ -550,7 +554,9 @@ cdef class BitmaskArray: ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) - BitmapAnd(&self_.bitmap, &other_bma.bitmap, &bitmap) + ret = BitmapAnd(&self_.bitmap, &other_bma.bitmap, &bitmap) + if ret == -1: + raise RuntimeError("BitmapAnd failed") bma = BitmaskArray.__new__(BitmaskArray) bma.bitmap = bitmap @@ -564,7 +570,9 @@ cdef class BitmaskArray: bval = other ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) - BitmapAndBool(&self_.bitmap, bval, &bitmap) + ret = BitmapAndBool(&self_.bitmap, bval, &bitmap) + if ret == -1: + raise RuntimeError("BitmapAndBool failed") bma = BitmaskArray.__new__(BitmaskArray) 
bma.bitmap = bitmap @@ -583,6 +591,7 @@ cdef class BitmaskArray: cdef BitmaskArray bma cdef ArrowBitmap bitmap cdef bint bval + cdef int ret if isinstance(other, BitmaskArray): other_bma = other @@ -597,7 +606,9 @@ cdef class BitmaskArray: ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) - BitmapOr(&self_.bitmap, &other_bma.bitmap, &bitmap) + ret = BitmapOr(&self_.bitmap, &other_bma.bitmap, &bitmap) + if ret == -1: + raise RuntimeError("BitmapOr failed") bma = BitmaskArray.__new__(BitmaskArray) bma.bitmap = bitmap @@ -611,7 +622,9 @@ cdef class BitmaskArray: bval = other ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) - BitmapOrBool(&self_.bitmap, bval, &bitmap) + ret = BitmapOrBool(&self_.bitmap, bval, &bitmap) + if ret == -1: + raise RuntimeError("BitmapOrBool failed") bma = BitmaskArray.__new__(BitmaskArray) bma.bitmap = bitmap @@ -629,6 +642,7 @@ cdef class BitmaskArray: cdef BitmaskArray bma cdef ArrowBitmap bitmap cdef bint bval + cdef int ret if isinstance(other, BitmaskArray): # TODO: maybe should return Self here instead of ndarray @@ -638,7 +652,9 @@ cdef class BitmaskArray: ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) - BitmapXor(&self_.bitmap, &other_bma.bitmap, &bitmap) + ret = BitmapXor(&self_.bitmap, &other_bma.bitmap, &bitmap) + if ret == -1: + raise RuntimeError("BitmapXor failed") bma = BitmaskArray.__new__(BitmaskArray) bma.bitmap = bitmap @@ -652,7 +668,9 @@ cdef class BitmaskArray: bval = other ArrowBitmapInit(&bitmap) ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) - BitmapXorBool(&self_.bitmap, bval, &bitmap) + ret = BitmapXorBool(&self_.bitmap, bval, &bitmap) + if ret == -1: + raise RuntimeError("BitmapXorBool failed") bma = BitmaskArray.__new__(BitmaskArray) bma.bitmap = bitmap diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index ccb682549d16a..399705fcbbdd6 100644 --- 
a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -75,9 +75,11 @@ bool BitmapAny(const struct ArrowBitmap *bitmap) { return false; } + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? overflow_limit : size_bytes; size_t i = 0; - for (; i + sizeof(size_t) - 1 < bitmap->buffer.size_bytes; - i += sizeof(size_t)) { + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { size_t value; memcpy(&value, &bitmap->buffer.data[i], sizeof(size_t)); if (value != 0x0) { @@ -108,9 +110,11 @@ bool BitmapAll(const struct ArrowBitmap *bitmap) { return true; } + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? overflow_limit : size_bytes; size_t i = 0; - for (; i + sizeof(size_t) - 1 < bitmap->buffer.size_bytes; - i += sizeof(size_t)) { + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { size_t value; memcpy(&value, &bitmap->buffer.data[i], sizeof(size_t)); if (value != SIZE_MAX) { @@ -136,15 +140,18 @@ bool BitmapAll(const struct ArrowBitmap *bitmap) { int BitmapOr(const struct ArrowBitmap *bitmap1, const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out) { + const size_t size_bytes = bitmap1->buffer.size_bytes; if (bitmap1->size_bits != bitmap2->size_bits) { return -1; - } else if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { + } else if (!(out->buffer.capacity_bytes >= size_bytes)) { return -1; } + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? 
overflow_limit : size_bytes; size_t i = 0; - for (; i + sizeof(size_t) - 1 < bitmap1->buffer.size_bytes; - i += sizeof(size_t)) { + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { size_t value1; size_t value2; size_t result; @@ -166,16 +173,19 @@ int BitmapOr(const struct ArrowBitmap *bitmap1, int BitmapOrBool(const struct ArrowBitmap *bitmap1, bool other, struct ArrowBitmap *out) { - if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { + const size_t size_bytes = bitmap1->buffer.size_bytes; + if (!(out->buffer.capacity_bytes >= size_bytes)) { return -1; } const size_t mask = other ? SIZE_MAX : 0; const uint8_t umask = other ? UINT8_MAX : 0; + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? overflow_limit : size_bytes; size_t i = 0; - for (; i + sizeof(size_t) - 1 < bitmap1->buffer.size_bytes; - i += sizeof(size_t)) { + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { size_t value1; size_t result; memcpy(&value1, &bitmap1->buffer.data[i], sizeof(size_t)); @@ -195,15 +205,18 @@ int BitmapOrBool(const struct ArrowBitmap *bitmap1, bool other, int BitmapAnd(const struct ArrowBitmap *bitmap1, const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out) { + const size_t size_bytes = bitmap1->buffer.size_bytes; if (bitmap1->size_bits != bitmap2->size_bits) { return -1; - } else if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { + } else if (!(out->buffer.capacity_bytes >= size_bytes)) { return -1; } + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? 
overflow_limit : size_bytes; size_t i = 0; - for (; i + sizeof(size_t) - 1 < bitmap1->buffer.size_bytes; - i += sizeof(size_t)) { + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { size_t value1; size_t value2; size_t result; @@ -225,6 +238,7 @@ int BitmapAnd(const struct ArrowBitmap *bitmap1, int BitmapAndBool(const struct ArrowBitmap *bitmap1, bool other, struct ArrowBitmap *out) { + const size_t size_bytes = bitmap1->buffer.size_bytes; if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { return -1; } @@ -232,9 +246,11 @@ int BitmapAndBool(const struct ArrowBitmap *bitmap1, bool other, const size_t mask = other ? SIZE_MAX : 0; const uint8_t umask = other ? UINT8_MAX : 0; + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? overflow_limit : size_bytes; size_t i = 0; - for (; i + sizeof(size_t) - 1 < bitmap1->buffer.size_bytes; - i += sizeof(size_t)) { + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { size_t value1; size_t result; memcpy(&value1, &bitmap1->buffer.data[i], sizeof(size_t)); @@ -254,15 +270,18 @@ int BitmapAndBool(const struct ArrowBitmap *bitmap1, bool other, int BitmapXor(const struct ArrowBitmap *bitmap1, const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out) { + const size_t size_bytes = bitmap1->buffer.size_bytes; if (bitmap1->size_bits != bitmap2->size_bits) { return -1; - } else if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { + } else if (!(out->buffer.capacity_bytes >= size_bytes)) { return -1; } + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? 
overflow_limit : size_bytes; size_t i = 0; - for (; i + sizeof(size_t) - 1 < bitmap1->buffer.size_bytes; - i += sizeof(size_t)) { + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { size_t value1; size_t value2; size_t result; @@ -284,6 +303,7 @@ int BitmapXor(const struct ArrowBitmap *bitmap1, int BitmapXorBool(const struct ArrowBitmap *bitmap1, bool other, struct ArrowBitmap *out) { + const size_t size_bytes = bitmap1->buffer.size_bytes; if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { return -1; } @@ -291,9 +311,11 @@ int BitmapXorBool(const struct ArrowBitmap *bitmap1, bool other, const size_t mask = other ? SIZE_MAX : 0; const uint8_t umask = other ? UINT8_MAX : 0; + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? overflow_limit : size_bytes; size_t i = 0; - for (; i + sizeof(size_t) - 1 < bitmap1->buffer.size_bytes; - i += sizeof(size_t)) { + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { size_t value1; size_t result; memcpy(&value1, &bitmap1->buffer.data[i], sizeof(size_t)); @@ -312,13 +334,16 @@ int BitmapXorBool(const struct ArrowBitmap *bitmap1, bool other, } int BitmapInvert(const struct ArrowBitmap *bitmap, struct ArrowBitmap *out) { - if (!(out->buffer.capacity_bytes >= bitmap->buffer.size_bytes)) { + const size_t size_bytes = bitmap->buffer.size_bytes; + if (!(out->buffer.capacity_bytes >= size_bytes)) { return -1; } + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? 
overflow_limit : size_bytes; size_t i = 0; - for (; i + sizeof(size_t) - 1 < bitmap->buffer.size_bytes; - i += sizeof(size_t)) { + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { size_t value; size_t result; memcpy(&value, &bitmap->buffer.data[i], sizeof(size_t)); From 5e9f08c83b330b25b9e1e1520eddb4bf1af3a3f9 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 30 Aug 2023 17:42:49 -0400 Subject: [PATCH 118/126] re-enable cpplint --- .pre-commit-config.yaml | 16 ++++++++++++++++ pandas/_libs/include/pandas/bitmask_algorithms.h | 11 ++++++++++- pandas/_libs/src/bitmask_algorithms.c | 12 +++++++++++- 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f024245009d71..f73eba7070025 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -73,6 +73,22 @@ repos: - id: fix-encoding-pragma args: [--remove] - id: trailing-whitespace +- repo: https://github.com/cpplint/cpplint + rev: 1.6.1 + hooks: + - id: cpplint + exclude: | + ^pandas/_libs/include/pandas/vendored/klib + |pandas/_libs/include/pandas/vendored/nanoarrow.h + |pandas/_libs/src/vendored/nanoarrow.c + args: [ + --quiet, + '--extensions=c,h', + '--headers=h', + --recursive, + --linelength=88, + '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size' + ] - repo: https://github.com/pylint-dev/pylint rev: v3.0.0a6 hooks: diff --git a/pandas/_libs/include/pandas/bitmask_algorithms.h b/pandas/_libs/include/pandas/bitmask_algorithms.h index 8598b5ebdd568..fa70b1a472fc4 100644 --- a/pandas/_libs/include/pandas/bitmask_algorithms.h +++ b/pandas/_libs/include/pandas/bitmask_algorithms.h @@ -1,4 +1,13 @@ -// The full license is in the LICENSE file, distributed with this software. +/* + +Copyright (c) 2023, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. 
+ +*/ #pragma once diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c index 399705fcbbdd6..6b944729445d0 100644 --- a/pandas/_libs/src/bitmask_algorithms.c +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -1,4 +1,14 @@ -// The full license is in the LICENSE file, distributed with this software. +/* + +Copyright (c) 2023, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. + +*/ + #include From 97da641b90b12ef4c32358e25c5dfc23ae4b71c2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 30 Aug 2023 17:57:37 -0400 Subject: [PATCH 119/126] updated pre-commit --- .pre-commit-config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f73eba7070025..52707d305b886 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -47,6 +47,7 @@ repos: types_or: [python, rst, markdown, cython, c] additional_dependencies: [tomli] exclude: | + (?x) ^pandas/_libs/include/pandas/vendored/nanoarrow.h |pandas/_libs/src/vendored/nanoarrow.c - repo: https://github.com/MarcoGorelli/cython-lint @@ -78,6 +79,7 @@ repos: hooks: - id: cpplint exclude: | + (?x) ^pandas/_libs/include/pandas/vendored/klib |pandas/_libs/include/pandas/vendored/nanoarrow.h |pandas/_libs/src/vendored/nanoarrow.c From a3dca8abd0c1b674e48d6f8f89eed094678e5876 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 30 Aug 2023 19:17:28 -0400 Subject: [PATCH 120/126] Fix typing issues --- pandas/_libs/hashtable.pyi | 2 +- pandas/core/groupby/ops.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index cd51875eb3df1..3e219e4974a89 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -232,7 +232,7 @@ class IntpHashTable(HashTable): ... 
def duplicated( values: np.ndarray, keep: Literal["last", "first", False] = ..., - mask: npt.NDArray[np.bool_] | None = ..., + mask: npt.NDArray[np.bool_] | BitmaskArray | None = ..., ) -> npt.NDArray[np.bool_]: ... def mode( values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = ... diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 71525c8c1a223..2ba913cf1de65 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -22,6 +22,7 @@ NaT, lib, ) +from pandas._libs.arrays import BitmaskArray import pandas._libs.groupby as libgroupby from pandas._typing import ( ArrayLike, @@ -309,11 +310,13 @@ def _cython_op_ndim_compat( min_count: int, ngroups: int, comp_ids: np.ndarray, - mask: npt.NDArray[np.bool_] | None = None, + mask: npt.NDArray[np.bool_] | BitmaskArray | None = None, result_mask: npt.NDArray[np.bool_] | None = None, **kwargs, ) -> np.ndarray: if values.ndim == 1: + if isinstance(mask, BitmaskArray): + mask = mask.to_numpy() # expand to 2d, dispatch, then squeeze if appropriate values2d = values[None, :] if mask is not None: @@ -353,7 +356,7 @@ def _call_cython_op( min_count: int, ngroups: int, comp_ids: np.ndarray, - mask: npt.NDArray[np.bool_] | None, + mask: npt.NDArray[np.bool_] | BitmaskArray | None, result_mask: npt.NDArray[np.bool_] | None, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] @@ -387,6 +390,9 @@ def _call_cython_op( values = values.T if mask is not None: + if isinstance(mask, BitmaskArray): + mask = mask.to_numpy() + mask = mask.T if result_mask is not None: result_mask = result_mask.T From 1f77d9a7ed58f01a7e04cd849ac6c50000e46ea7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 30 Aug 2023 21:50:43 -0400 Subject: [PATCH 121/126] more cleanups --- pandas/core/algorithms.py | 9 +++++++-- pandas/core/arrays/masked.py | 5 +++-- pandas/core/tools/numeric.py | 12 +++++++----- pandas/tests/arrays/categorical/test_astype.py | 9 --------- pandas/tests/arrays/floating/test_arithmetic.py | 3 
+-- 5 files changed, 18 insertions(+), 20 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9769e37829ec8..ba3d9fca3fea1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -443,9 +443,14 @@ def unique_with_mask(values, mask: npt.NDArray[np.bool_] | BitmaskArray | None = return uniques else: - uniques, np_mask = table.unique(values, mask=mask) + uniques, mask = table.unique(values, mask=mask) uniques = _reconstruct_data(uniques, original.dtype, original) - return uniques, np_mask.astype("bool") + + if isinstance(mask, BitmaskArray): + mask = mask.to_numpy() + + assert mask is not None # for mypy + return uniques, mask.astype("bool") unique1d = unique diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 8883095b56b09..a74a803d862e5 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1151,6 +1151,7 @@ def _quantile( def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): + mask: BitmaskArray | np.ndarray if name in {"any", "all", "min", "max", "sum", "prod", "mean", "var", "std"}: result = getattr(self, name)(skipna=skipna, **kwargs) else: @@ -1166,8 +1167,8 @@ def _reduce( return self._wrap_na_result(name=name, axis=0, mask_size=(1,)) else: result = result.reshape(1) - np_mask = np.zeros(1, dtype=bool) - return self._maybe_mask_result(result, np_mask) + mask = np.zeros(1, dtype=bool) + return self._maybe_mask_result(result, mask) if isna(result): return libmissing.NA diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 23dc964246b91..28612e7aef6ef 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -8,6 +8,7 @@ import numpy as np from pandas._libs import lib +from pandas._libs.arrays import BitmaskArray from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.cast import maybe_downcast_numeric @@ -201,10 +202,10 @@ def to_numeric( # GH33013: for 
IntegerArray & FloatingArray extract non-null values for casting # save mask to reconstruct the full array after casting - mask: npt.NDArray[np.bool_] | None = None + mask: npt.NDArray[np.bool_] | BitmaskArray | None = None if isinstance(values, BaseMaskedArray): - mask = values._mask.to_numpy() - values = values._data[~mask] + mask = values._mask + values = values._data[~mask] # type: ignore[call-overload] values_dtype = getattr(values, "dtype", None) if isinstance(values_dtype, ArrowDtype): @@ -278,8 +279,9 @@ def to_numeric( if mask is None or (new_mask is not None and new_mask.shape == mask.shape): # GH 52588 mask = new_mask - - assert isinstance(mask, np.ndarray) + else: + mask = mask.copy() + assert isinstance(mask, (np.ndarray, BitmaskArray)) data = np.zeros(mask.shape, dtype=values.dtype) data[~mask] = values diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index ace785e6ae5c8..94b095fc0fa91 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -145,12 +145,3 @@ def test_astype_object_timestamp_categories(self): result = cat.astype(object) expected = np.array([Timestamp("2014-01-01 00:00:00")], dtype="object") tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.skip(reason="Not applicable with bitmask backed arrays") - def test_astype_category_readonly_mask_values(self): - # GH#53658 - arr = array([0, 1, 2], dtype="Int64") - arr._mask.flags["WRITEABLE"] = False - result = arr.astype("category") - expected = array([0, 1, 2], dtype="Int64").astype("category") - tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 052f38dfce5af..056c22d8c1131 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -88,8 +88,7 @@ def test_pow_scalar(dtype): result = 
np.nan**a expected = FloatingArray( - np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), - mask=a._mask, + np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask ) tm.assert_extension_array_equal(result, expected) From 6a56ec1884ba861205fbf1f152b7b78d7f229040 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 6 Sep 2023 16:22:30 -0400 Subject: [PATCH 122/126] remove cast --- pandas/core/algorithms.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ac3c6830fa006..565a7f3b36cae 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -446,11 +446,8 @@ def unique_with_mask(values, mask: npt.NDArray[np.bool_] | BitmaskArray | None = uniques, mask = table.unique(values, mask=mask) uniques = _reconstruct_data(uniques, original.dtype, original) - if isinstance(mask, BitmaskArray): - mask = mask.to_numpy() - assert mask is not None # for mypy - return uniques, mask.astype("bool") + return uniques, mask unique1d = unique From 23fb76d427ba6d03e80be0dfff88b06ae2dde16c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 6 Sep 2023 17:11:11 -0400 Subject: [PATCH 123/126] less diff --- pandas/core/algorithms.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 565a7f3b36cae..de9e6a894669b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -445,7 +445,6 @@ def unique_with_mask(values, mask: npt.NDArray[np.bool_] | BitmaskArray | None = else: uniques, mask = table.unique(values, mask=mask) uniques = _reconstruct_data(uniques, original.dtype, original) - assert mask is not None # for mypy return uniques, mask From 3fb26ec41603c2d2b5b713a81536ef91eb9e3b7d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 15 Sep 2023 11:54:32 -0400 Subject: [PATCH 124/126] reverted cythonized is_null_slice --- pandas/_libs/arrays.pyx | 2 +- pandas/_libs/lib.pyi | 1 - pandas/_libs/lib.pyx 
| 19 ------------------- pandas/core/common.py | 7 ++++++- 4 files changed, 7 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index fd8382c13e6c3..8f0f0f58db983 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -20,7 +20,7 @@ from numpy cimport ( uint8_t, ) -from pandas._libs.lib import is_null_slice +from pandas.core.common import is_null_slice cnp.import_array() diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 946cbb7dbf6f1..15bd5a7379105 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -44,7 +44,6 @@ def is_iterator(obj: object) -> bool: ... def is_scalar(val: object) -> bool: ... def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ... def is_pyarrow_array(obj: object) -> bool: ... -def is_null_slice(obj: object) -> bool: ... def is_period(val: object) -> TypeGuard[Period]: ... def is_interval(val: object) -> TypeGuard[Interval]: ... def is_decimal(val: object) -> TypeGuard[Decimal]: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a9e23b95d9b7d..0c0610f72044e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -29,7 +29,6 @@ from cpython.object cimport ( ) from cpython.ref cimport Py_INCREF from cpython.sequence cimport PySequence_Check -from cpython.slice cimport PySlice_Unpack from cpython.tuple cimport ( PyTuple_New, PyTuple_SET_ITEM, @@ -72,7 +71,6 @@ cdef extern from "Python.h": # Note: importing extern-style allows us to declare these as nogil # functions, whereas `from cpython cimport` does not. 
bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil - cdef Py_ssize_t PY_SSIZE_T_MAX cdef extern from "numpy/arrayobject.h": # cython's numpy.dtype specification is incorrect, which leads to @@ -1252,23 +1250,6 @@ def is_pyarrow_array(obj): return False -def is_null_slice(obj): - """ - Return True if given object - """ - cdef Py_ssize_t start, stop, step - if isinstance(obj, slice): - try: - PySlice_Unpack(obj, &start, &stop, &step) - except TypeError: - return False - - if start == 0 and stop == PY_SSIZE_T_MAX and step == 1: - return True - - return False - - _TYPE_MAP = { "categorical": "categorical", "category": "categorical", diff --git a/pandas/core/common.py b/pandas/core/common.py index 2b243ec21818d..8fd8b10c6fc32 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -307,7 +307,12 @@ def is_null_slice(obj) -> bool: """ We have a null slice. """ - return lib.is_null_slice(obj) + return ( + isinstance(obj, slice) + and obj.start is None + and obj.stop is None + and obj.step is None + ) def is_empty_slice(obj) -> bool: From 541de2e9c1b27445668bf331aa510cb1b4e7dc4b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 15 Sep 2023 11:55:22 -0400 Subject: [PATCH 125/126] remove xfail of test --- pandas/tests/frame/indexing/test_where.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index d08701bf5035c..1eb67671da0b8 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas.compat.numpy import np_version_gte1p24 - from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -707,11 +705,6 @@ def test_where_categorical_filtering(self): tm.assert_equal(result, expected) - @pytest.mark.xfail( - not np_version_gte1p24, - reason="Changed NumPy behavior for >1D non-tuple sequence indexing", - strict=False, - ) def test_where_ea_other(self): 
# GH#38729/GH#38742 df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) From 34bc194b199735c842439c4f32ff3f06a3bc4e0d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 15 Sep 2023 11:59:54 -0400 Subject: [PATCH 126/126] change assert to ignore --- pandas/core/arrays/masked.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index a74a803d862e5..1fc1dcb5578aa 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -198,8 +198,7 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self.dtype.na_value return self._data[item] - assert not isinstance(newmask, bool) # for mypy - return self._simple_new(self._data[item], newmask) + return self._simple_new(self._data[item], newmask) # type: ignore[arg-type] def _pad_or_backfill( self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True