Skip to content

Commit ba1f992

Browse files
authored
GH-34659: [C++] Review the validation processes around Run-End Encoded arrays to improve the Python integration (#34628)
All the C++ commits from #34570 to simplify review and CI debugging. * Closes: #34659 Authored-by: Felipe Oliveira Carvalho <[email protected]> Signed-off-by: Matt Topol <[email protected]>
1 parent 69118b2 commit ba1f992

File tree

10 files changed

+232
-136
lines changed

10 files changed

+232
-136
lines changed

cpp/src/arrow/api.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#pragma once
2121

2222
#include "arrow/array.h" // IYWU pragma: export
23+
#include "arrow/array/array_run_end.h" // IYWU pragma: export
2324
#include "arrow/array/concatenate.h" // IYWU pragma: export
2425
#include "arrow/buffer.h" // IYWU pragma: export
2526
#include "arrow/builder.h" // IYWU pragma: export

cpp/src/arrow/array/array_run_end.cc

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -41,24 +41,30 @@ RunEndEncodedArray::RunEndEncodedArray(const std::shared_ptr<DataType>& type,
4141
/*null_count=*/0, offset));
4242
}
4343

44+
Result<std::shared_ptr<RunEndEncodedArray>> RunEndEncodedArray::Make(
45+
const std::shared_ptr<DataType>& type, int64_t logical_length,
46+
const std::shared_ptr<Array>& run_ends, const std::shared_ptr<Array>& values,
47+
int64_t logical_offset) {
48+
if (type->id() != Type::RUN_END_ENCODED) {
49+
return Status::Invalid("Type must be RUN_END_ENCODED");
50+
}
51+
const auto* ree_type = internal::checked_cast<const RunEndEncodedType*>(type.get());
52+
RETURN_NOT_OK(ree_util::ValidateRunEndEncodedChildren(
53+
*ree_type, logical_length, run_ends->data(), values->data(), 0, logical_offset));
54+
return std::make_shared<RunEndEncodedArray>(type, logical_length, run_ends, values,
55+
logical_offset);
56+
}
57+
4458
Result<std::shared_ptr<RunEndEncodedArray>> RunEndEncodedArray::Make(
4559
int64_t logical_length, const std::shared_ptr<Array>& run_ends,
4660
const std::shared_ptr<Array>& values, int64_t logical_offset) {
4761
auto run_end_type = run_ends->type();
48-
auto values_type = values->type();
62+
auto value_type = values->type();
4963
if (!RunEndEncodedType::RunEndTypeValid(*run_end_type)) {
5064
return Status::Invalid("Run end type must be int16, int32 or int64");
5165
}
52-
if (run_ends->null_count() != 0) {
53-
return Status::Invalid("Run ends array cannot contain null values");
54-
}
55-
if (values->length() < run_ends->length()) {
56-
return Status::Invalid("Values array has to be at least as long as run ends array");
57-
}
58-
59-
return std::make_shared<RunEndEncodedArray>(
60-
run_end_encoded(std::move(run_end_type), std::move(values_type)), logical_length,
61-
run_ends, values, logical_offset);
66+
auto ree_type = run_end_encoded(std::move(run_end_type), std::move(value_type));
67+
return Make(ree_type, logical_length, run_ends, values, logical_offset);
6268
}
6369

6470
void RunEndEncodedArray::SetData(const std::shared_ptr<ArrayData>& data) {

cpp/src/arrow/array/array_run_end.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,17 @@ class ARROW_EXPORT RunEndEncodedArray : public Array {
6565
const std::shared_ptr<Array>& run_ends,
6666
const std::shared_ptr<Array>& values, int64_t offset = 0);
6767

68+
/// \brief Construct a RunEndEncodedArray from all parameters
69+
///
70+
/// The length and offset parameters refer to the dimensions of the logical
71+
/// array which is the array we would get after expanding all the runs into
72+
/// repeated values. As such, length can be much greater than the lenght of
73+
/// the child run_ends and values arrays.
74+
static Result<std::shared_ptr<RunEndEncodedArray>> Make(
75+
const std::shared_ptr<DataType>& type, int64_t logical_length,
76+
const std::shared_ptr<Array>& run_ends, const std::shared_ptr<Array>& values,
77+
int64_t logical_offset = 0);
78+
6879
/// \brief Construct a RunEndEncodedArray from values and run ends arrays
6980
///
7081
/// The data type is automatically inferred from the arguments.

cpp/src/arrow/array/array_run_end_test.cc

Lines changed: 77 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -112,14 +112,18 @@ TEST_P(TestRunEndEncodedArray, FromRunEndsAndValues) {
112112
ASSERT_EQ(ree_array->data()->null_count, 0);
113113
ASSERT_EQ(ree_array->offset(), 1);
114114

115-
ASSERT_RAISES_WITH_MESSAGE(Invalid,
116-
"Invalid: Run end type must be int16, int32 or int64",
117-
RunEndEncodedArray::Make(30, string_values, int32_values));
118-
ASSERT_RAISES_WITH_MESSAGE(
119-
Invalid, "Invalid: Run ends array cannot contain null values",
115+
EXPECT_RAISES_WITH_MESSAGE_THAT(
116+
Invalid,
117+
::testing::HasSubstr("Invalid: Run end type must be int16, int32 or int64"),
118+
RunEndEncodedArray::Make(30, string_values, int32_values));
119+
EXPECT_RAISES_WITH_MESSAGE_THAT(
120+
Invalid,
121+
::testing::HasSubstr("Invalid: Null count must be 0 for run ends array, but is 3"),
120122
RunEndEncodedArray::Make(30, run_end_only_null, int32_values));
121-
ASSERT_RAISES_WITH_MESSAGE(
122-
Invalid, "Invalid: Values array has to be at least as long as run ends array",
123+
EXPECT_RAISES_WITH_MESSAGE_THAT(
124+
Invalid,
125+
::testing::HasSubstr(
126+
"Invalid: Length of run_ends is greater than the length of values: 3 > 2"),
123127
RunEndEncodedArray::Make(30, run_end_values, ArrayFromJSON(int32(), "[2, 0]")));
124128
}
125129

@@ -387,10 +391,11 @@ TEST_P(TestRunEndEncodedArray, Validate) {
387391

388392
auto empty_run_ends = MakeArray(empty_array->data()->Copy());
389393
empty_run_ends->data()->length = 1;
390-
ASSERT_RAISES_WITH_MESSAGE(Invalid,
391-
"Invalid: Run-end encoded array has non-zero length 1, "
392-
"but run ends array has zero length",
393-
empty_run_ends->Validate());
394+
EXPECT_RAISES_WITH_MESSAGE_THAT(
395+
Invalid,
396+
::testing::HasSubstr("Invalid: Run-end encoded array has non-zero length 1, "
397+
"but run ends array has zero length"),
398+
empty_run_ends->Validate());
394399

395400
auto offset_length_overflow = MakeArray(good_array->data()->Copy());
396401
offset_length_overflow->data()->offset = std::numeric_limits<int64_t>::max();
@@ -408,11 +413,12 @@ TEST_P(TestRunEndEncodedArray, Validate) {
408413
too_large_for_ree16->data()->offset = std::numeric_limits<int16_t>::max();
409414
too_large_for_ree16->data()->length = 1;
410415
if (run_end_type->id() == Type::INT16) {
411-
ASSERT_RAISES_WITH_MESSAGE(
416+
EXPECT_RAISES_WITH_MESSAGE_THAT(
412417
Invalid,
413-
"Invalid: Offset + length of a run-end encoded array must fit in a value"
414-
" of the run end type int16, but offset + length is 32768 while"
415-
" the allowed maximum is 32767",
418+
::testing::HasSubstr(
419+
"Invalid: Offset + length of a run-end encoded array must fit in a value"
420+
" of the run end type int16, but offset + length is 32768 while"
421+
" the allowed maximum is 32767"),
416422
too_large_for_ree16->Validate());
417423
} else {
418424
ASSERT_OK(too_large_for_ree16->ValidateFull());
@@ -423,57 +429,64 @@ TEST_P(TestRunEndEncodedArray, Validate) {
423429
too_large_for_ree32->data()->offset = std::numeric_limits<int32_t>::max();
424430
too_large_for_ree32->data()->length = 1;
425431
if (run_end_type->id() == Type::INT16) {
426-
ASSERT_RAISES_WITH_MESSAGE(
432+
EXPECT_RAISES_WITH_MESSAGE_THAT(
427433
Invalid,
428-
"Invalid: Offset + length of a run-end encoded array must fit in a "
429-
"value of the run end type int16, but offset + length "
430-
"is 2147483648 while the allowed maximum is 32767",
434+
::testing::HasSubstr(
435+
"Invalid: Offset + length of a run-end encoded array must fit in a "
436+
"value of the run end type int16, but offset + length "
437+
"is 2147483648 while the allowed maximum is 32767"),
431438
too_large_for_ree32->Validate());
432439
} else if (run_end_type->id() == Type::INT32) {
433-
ASSERT_RAISES_WITH_MESSAGE(
440+
EXPECT_RAISES_WITH_MESSAGE_THAT(
434441
Invalid,
435-
"Invalid: Offset + length of a run-end encoded array must fit in a "
436-
"value of the run end type int32, but offset + length "
437-
"is 2147483648 while the allowed maximum is 2147483647",
442+
::testing::HasSubstr(
443+
"Invalid: Offset + length of a run-end encoded array must fit in a "
444+
"value of the run end type int32, but offset + length "
445+
"is 2147483648 while the allowed maximum is 2147483647"),
438446
too_large_for_ree32->Validate());
439447
} else {
440448
ASSERT_OK(too_large_for_ree32->ValidateFull());
441449
}
442450

443451
auto too_many_children = MakeArray(good_array->data()->Copy());
444452
too_many_children->data()->child_data.push_back(NULLPTR);
445-
ASSERT_RAISES_WITH_MESSAGE(
453+
EXPECT_RAISES_WITH_MESSAGE_THAT(
446454
Invalid,
447-
std::string("Invalid: Expected 2 child arrays in array of type "
448-
"run_end_encoded<run_ends: ") +
449-
run_end_type->ToString() + ", values: string>, got 3",
455+
::testing::HasSubstr(
456+
std::string("Invalid: Expected 2 child arrays in array of type "
457+
"run_end_encoded<run_ends: ") +
458+
run_end_type->ToString() + ", values: string>, got 3"),
450459
too_many_children->Validate());
451460

452461
auto run_ends_nullptr = MakeArray(good_array->data()->Copy());
453462
run_ends_nullptr->data()->child_data[0] = NULLPTR;
454-
ASSERT_RAISES_WITH_MESSAGE(Invalid, "Invalid: Run ends array is null pointer",
455-
run_ends_nullptr->Validate());
463+
EXPECT_RAISES_WITH_MESSAGE_THAT(
464+
Invalid, ::testing::HasSubstr("Invalid: Run ends array is null pointer"),
465+
run_ends_nullptr->Validate());
456466

457467
auto values_nullptr = MakeArray(good_array->data()->Copy());
458468
values_nullptr->data()->child_data[1] = NULLPTR;
459-
ASSERT_RAISES_WITH_MESSAGE(Invalid, "Invalid: Values array is null pointer",
460-
values_nullptr->Validate());
469+
EXPECT_RAISES_WITH_MESSAGE_THAT(
470+
Invalid, ::testing::HasSubstr("Invalid: Values array is null pointer"),
471+
values_nullptr->Validate());
461472

462473
auto run_ends_string = MakeArray(good_array->data()->Copy());
463474
run_ends_string->data()->child_data[0] = values->data();
464-
ASSERT_RAISES_WITH_MESSAGE(
475+
EXPECT_RAISES_WITH_MESSAGE_THAT(
465476
Invalid,
466-
std::string("Invalid: Run ends array of run_end_encoded<run_ends: ") +
477+
::testing::HasSubstr(
478+
std::string("Invalid: Run ends array of run_end_encoded<run_ends: ") +
467479
run_end_type->ToString() + ", values: string> must be " +
468-
run_end_type->ToString() + ", but run end type is string",
480+
run_end_type->ToString() + ", but run end type is string"),
469481
run_ends_string->Validate());
470482

471483
auto wrong_type = MakeArray(good_array->data()->Copy());
472484
wrong_type->data()->type = run_end_encoded(run_end_type, uint16());
473-
ASSERT_RAISES_WITH_MESSAGE(Invalid,
474-
"Invalid: Parent type says this array encodes uint16 "
475-
"values, but value type is string",
476-
wrong_type->Validate());
485+
EXPECT_RAISES_WITH_MESSAGE_THAT(
486+
Invalid,
487+
::testing::HasSubstr("Invalid: Parent type says this array encodes uint16 "
488+
"values, but value type is string"),
489+
wrong_type->Validate());
477490

478491
{
479492
// malformed_array has its buffers deallocated after the RunEndEncodedArray is
@@ -506,37 +519,42 @@ TEST_P(TestRunEndEncodedArray, Validate) {
506519
ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> run_end_zero_array,
507520
RunEndEncodedArray::Make(40, run_ends_with_zero, values));
508521
ASSERT_OK(run_end_zero_array->Validate());
509-
ASSERT_RAISES_WITH_MESSAGE(
510-
Invalid, "Invalid: All run ends must be greater than 0 but the first run end is 0",
522+
EXPECT_RAISES_WITH_MESSAGE_THAT(
523+
Invalid,
524+
::testing::HasSubstr(
525+
"Invalid: All run ends must be greater than 0 but the first run end is 0"),
511526
run_end_zero_array->ValidateFull());
512527
// The whole run ends array has to be valid even if the parent is sliced
513528
run_end_zero_array = run_end_zero_array->Slice(30, 0);
514-
ASSERT_RAISES_WITH_MESSAGE(
515-
Invalid, "Invalid: All run ends must be greater than 0 but the first run end is 0",
529+
EXPECT_RAISES_WITH_MESSAGE_THAT(
530+
Invalid,
531+
::testing::HasSubstr(
532+
"Invalid: All run ends must be greater than 0 but the first run end is 0"),
516533
run_end_zero_array->ValidateFull());
517534

518535
ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> run_ends_not_ordered_array,
519536
RunEndEncodedArray::Make(40, run_ends_not_ordered, values));
520537
ASSERT_OK(run_ends_not_ordered_array->Validate());
521-
ASSERT_RAISES_WITH_MESSAGE(
538+
EXPECT_RAISES_WITH_MESSAGE_THAT(
522539
Invalid,
523-
"Invalid: Every run end must be strictly greater than the previous run end, but "
524-
"run_ends[3] is 40 and run_ends[2] is 40",
540+
::testing::HasSubstr("Invalid: Every run end must be strictly greater than the "
541+
"previous run end, but "
542+
"run_ends[3] is 40 and run_ends[2] is 40"),
525543
run_ends_not_ordered_array->ValidateFull());
526544
// The whole run ends array has to be valid even if the parent is sliced
527545
run_ends_not_ordered_array = run_ends_not_ordered_array->Slice(30, 0);
528-
ASSERT_RAISES_WITH_MESSAGE(
546+
EXPECT_RAISES_WITH_MESSAGE_THAT(
529547
Invalid,
530-
"Invalid: Every run end must be strictly greater than the previous run end, but "
531-
"run_ends[3] is 40 and run_ends[2] is 40",
548+
::testing::HasSubstr("Invalid: Every run end must be strictly greater than the "
549+
"previous run end, but "
550+
"run_ends[3] is 40 and run_ends[2] is 40"),
532551
run_ends_not_ordered_array->ValidateFull());
533552

534-
ASSERT_OK_AND_ASSIGN(auto run_ends_too_low_array,
535-
RunEndEncodedArray::Make(40, run_ends_too_low, values));
536-
ASSERT_RAISES_WITH_MESSAGE(Invalid,
537-
"Invalid: Last run end is 39 but it should match 40"
538-
" (offset: 0, length: 40)",
539-
run_ends_too_low_array->Validate());
553+
EXPECT_RAISES_WITH_MESSAGE_THAT(
554+
Invalid,
555+
::testing::HasSubstr("Invalid: Last run end is 39 but it should match 40"
556+
" (offset: 0, length: 40)"),
557+
RunEndEncodedArray::Make(40, run_ends_too_low, values));
540558
}
541559

542560
TEST_P(TestRunEndEncodedArray, Compare) {
@@ -613,12 +631,13 @@ TEST_P(TestRunEndEncodedArray, Concatenate) {
613631
default_memory_pool()));
614632
ASSERT_ARRAYS_EQUAL(*empty_array, *result);
615633

616-
ASSERT_RAISES_WITH_MESSAGE(
634+
EXPECT_RAISES_WITH_MESSAGE_THAT(
617635
Invalid,
618-
std::string("Invalid: arrays to be concatenated must be identically typed, but "
619-
"run_end_encoded<run_ends: ") +
636+
::testing::HasSubstr(
637+
std::string("Invalid: arrays to be concatenated must be identically typed, but "
638+
"run_end_encoded<run_ends: ") +
620639
run_end_type->ToString() + ", values: int32> and run_end_encoded<run_ends: " +
621-
run_end_type->ToString() + ", values: string> were encountered.",
640+
run_end_type->ToString() + ", values: string> were encountered."),
622641
Concatenate({int32_array, string_array}, default_memory_pool()));
623642
}
624643

cpp/src/arrow/array/validate.cc

Lines changed: 12 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -639,70 +639,27 @@ struct ValidateArrayImpl {
639639

640640
template <typename RunEndCType>
641641
Status ValidateRunEndEncoded(const RunEndEncodedType& type) {
642-
// Overflow was already checked at this point
643-
if (data.offset + data.length > std::numeric_limits<RunEndCType>::max()) {
644-
return Status::Invalid(
645-
"Offset + length of a run-end encoded array must fit in a value"
646-
" of the run end type ",
647-
*type.run_end_type(), ", but offset + length is ", data.offset + data.length,
648-
" while the allowed maximum is ", std::numeric_limits<RunEndCType>::max());
649-
}
650-
if (!data.child_data[0]) {
651-
return Status::Invalid("Run ends array is null pointer");
652-
}
653-
if (!data.child_data[1]) {
654-
return Status::Invalid("Values array is null pointer");
655-
}
656-
const ArrayData& run_ends_data = *data.child_data[0];
657-
const ArrayData& values_data = *data.child_data[1];
658-
if (*run_ends_data.type != *type.run_end_type()) {
659-
return Status::Invalid("Run ends array of ", type, " must be ",
660-
*type.run_end_type(), ", but run end type is ",
661-
*run_ends_data.type);
662-
}
663-
if (*values_data.type != *type.value_type()) {
664-
return Status::Invalid("Parent type says this array encodes ", *type.value_type(),
665-
" values, but value type is ", *values_data.type);
666-
}
667-
const Status run_ends_valid = RecurseInto(run_ends_data);
642+
const auto& run_ends_data = data.child_data[0];
643+
const auto& values_data = data.child_data[1];
644+
RETURN_NOT_OK(ree_util::ValidateRunEndEncodedChildren(
645+
type, data.length, run_ends_data, values_data, data.GetNullCount(), data.offset));
646+
647+
const Status run_ends_valid = RecurseInto(*run_ends_data);
668648
if (!run_ends_valid.ok()) {
669649
return Status::Invalid("Run ends array invalid: ", run_ends_valid.message());
670650
}
671-
const Status values_valid = RecurseInto(values_data);
651+
const Status values_valid = RecurseInto(*values_data);
672652
if (!values_valid.ok()) {
673653
return Status::Invalid("Values array invalid: ", values_valid.message());
674654
}
675-
if (data.GetNullCount() != 0) {
676-
return Status::Invalid("Null count must be 0 for run-end encoded array, but is ",
677-
data.GetNullCount());
678-
}
679-
if (run_ends_data.GetNullCount() != 0) {
680-
return Status::Invalid("Null count must be 0 for run ends array, but is ",
681-
run_ends_data.GetNullCount());
682-
}
683-
if (run_ends_data.length > values_data.length) {
684-
return Status::Invalid("Length of run_ends is greater than the length of values: ",
685-
run_ends_data.length, " > ", values_data.length);
686-
}
687-
if (run_ends_data.length == 0) {
688-
if (data.length == 0) {
689-
return Status::OK();
690-
}
691-
return Status::Invalid("Run-end encoded array has non-zero length ", data.length,
692-
", but run ends array has zero length");
693-
}
694-
if (!run_ends_data.buffers[1]->is_cpu()) {
655+
656+
if (run_ends_data->length == 0) {
695657
return Status::OK();
696658
}
697-
ArraySpan span(data);
698-
const auto* run_ends = ree_util::RunEnds<RunEndCType>(span);
699-
// The last run-end is the logical offset + the logical length.
700-
if (run_ends[run_ends_data.length - 1] < data.offset + data.length) {
701-
return Status::Invalid("Last run end is ", run_ends[run_ends_data.length - 1],
702-
" but it should match ", data.offset + data.length,
703-
" (offset: ", data.offset, ", length: ", data.length, ")");
704-
}
659+
705660
if (full_validation) {
661+
ArraySpan span(data);
662+
const auto* run_ends = ree_util::RunEnds<RunEndCType>(span);
706663
const int64_t run_ends_length = ree_util::RunEndsArray(span).length;
707664
if (run_ends[0] < 1) {
708665
return Status::Invalid(

0 commit comments

Comments
 (0)