Skip to content

Commit f19d652

Browse files
committed
add test coverage for read_file_slice_from_paths API
1 parent 98dbdc7 commit f19d652

File tree

2 files changed

+79
-24
lines changed

2 files changed

+79
-24
lines changed

crates/core/src/file_group/reader.rs

Lines changed: 65 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -486,6 +486,7 @@ mod tests {
486486
let base_uri = get_base_uri_with_valid_props_minimum();
487487
let reader = FileGroupReader::new_with_options(&base_uri, empty_options())?;
488488

489+
// Test with actual test files and empty log files - should trigger base_file_only logic
489490
let base_file_path = TEST_SAMPLE_BASE_FILE;
490491
let log_file_paths = vec![];
491492

@@ -507,22 +508,26 @@ mod tests {
507508
}
508509

509510
#[test]
510-
fn test_read_file_slice_from_paths_with_log_files() -> Result<()> {
511+
fn test_read_file_slice_from_paths_read_optimized_mode() -> Result<()> {
511512
let base_uri = get_base_uri_with_valid_props_minimum();
512-
let reader = FileGroupReader::new_with_options(&base_uri, empty_options())?;
513+
let reader = FileGroupReader::new_with_options(
514+
&base_uri,
515+
[(HudiReadConfig::UseReadOptimizedMode.as_ref(), "true")],
516+
)?;
513517

514518
let base_file_path = TEST_SAMPLE_BASE_FILE;
515519
let log_file_paths = vec![TEST_SAMPLE_LOG_FILE.to_string()];
516520

517521
let result = reader.read_file_slice_from_paths_blocking(base_file_path, log_file_paths);
518522

519-
// The actual file reading might fail due to missing test data, which is expected
523+
// In read-optimized mode, log files should be ignored
524+
// This should behave the same as read_file_slice_by_base_file_path
520525
match result {
521-
Ok(_batch) => {
522-
// Test passes if we get a valid batch
526+
Ok(_) => {
527+
// Test passes if we get a result - the method correctly ignored log files
523528
}
524529
Err(e) => {
525-
// Expected for missing test data - verify it's a storage/file not found error
530+
// Expected for missing test data
526531
let error_msg = e.to_string();
527532
assert!(
528533
error_msg.contains("not found") || error_msg.contains("No such file"),
@@ -536,26 +541,22 @@ mod tests {
536541
}
537542

538543
#[test]
539-
fn test_read_file_slice_from_paths_read_optimized_mode() -> Result<()> {
544+
fn test_read_file_slice_from_paths_with_log_files() -> Result<()> {
540545
let base_uri = get_base_uri_with_valid_props_minimum();
541-
let reader = FileGroupReader::new_with_options(
542-
&base_uri,
543-
[(HudiReadConfig::UseReadOptimizedMode.as_ref(), "true")],
544-
)?;
546+
let reader = FileGroupReader::new_with_options(&base_uri, empty_options())?;
545547

546548
let base_file_path = TEST_SAMPLE_BASE_FILE;
547549
let log_file_paths = vec![TEST_SAMPLE_LOG_FILE.to_string()];
548550

549551
let result = reader.read_file_slice_from_paths_blocking(base_file_path, log_file_paths);
550552

551-
// In read-optimized mode, log files should be ignored
552-
// This should behave the same as read_file_slice_by_base_file_path
553+
// The actual file reading might fail due to missing test data, which is expected
553554
match result {
554-
Ok(_) => {
555-
// Test passes if we get a result - the method correctly ignored log files
555+
Ok(_batch) => {
556+
// Test passes if we get a valid batch
556557
}
557558
Err(e) => {
558-
// Expected for missing test data
559+
// Expected for missing test data - verify it's a storage/file not found error
559560
let error_msg = e.to_string();
560561
assert!(
561562
error_msg.contains("not found") || error_msg.contains("No such file"),
@@ -613,4 +614,52 @@ mod tests {
613614

614615
Ok(())
615616
}
617+
618+
#[test]
619+
fn test_read_file_slice_from_paths_read_optimized_forces_base_only() -> Result<()> {
620+
let base_uri = get_base_uri_with_valid_props_minimum();
621+
let reader = FileGroupReader::new_with_options(
622+
&base_uri,
623+
[(HudiReadConfig::UseReadOptimizedMode.as_ref(), "true")],
624+
)?;
625+
626+
let base_file_path = "test.parquet";
627+
// Even with log files provided, read-optimized mode should ignore them
628+
let log_paths = vec!["log1.log".to_string(), "log2.log".to_string()];
629+
630+
let result = reader.read_file_slice_from_paths_blocking(base_file_path, log_paths);
631+
632+
assert!(result.is_err());
633+
let error_msg = result.unwrap_err().to_string();
634+
assert!(error_msg.contains("Failed to read path"));
635+
636+
Ok(())
637+
}
638+
639+
#[test]
640+
fn test_read_file_slice_from_paths_with_non_empty_logs_attempts_merge() -> Result<()> {
641+
let base_uri = get_base_uri_with_valid_props_minimum();
642+
// Explicitly disable read-optimized mode to force merge path
643+
let reader = FileGroupReader::new_with_options(
644+
&base_uri,
645+
[(HudiReadConfig::UseReadOptimizedMode.as_ref(), "false")],
646+
)?;
647+
648+
let base_file_path = "test.parquet";
649+
let log_paths = vec!["log1.log".to_string()];
650+
651+
let result = reader.read_file_slice_from_paths_blocking(base_file_path, log_paths);
652+
653+
// We expect this to fail, but it should exercise the merge logic path
654+
assert!(result.is_err());
655+
// The error could be either base file not found or log scanner issues
656+
let error_msg = result.unwrap_err().to_string();
657+
assert!(
658+
error_msg.contains("Failed to read path")
659+
|| error_msg.contains("not found")
660+
|| error_msg.contains("No such file")
661+
);
662+
663+
Ok(())
664+
}
616665
}

python/tests/test_file_group_read.py

Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,10 @@
2020
from hudi import HudiFileGroupReader
2121

2222
TEST_SAMPLE_BASE_FILE = "san_francisco/780b8586-3ad0-48ef-a6a1-d2217845ce4a-0_0-8-0_20240402123035233.parquet"
23-
TEST_SAMPLE_LOG_FILE = ".780b8586-3ad0-48ef-a6a1-d2217845ce4a-0_20240402123035233.log.1_0-8-0"
23+
TEST_SAMPLE_LOG_FILE = (
24+
".780b8586-3ad0-48ef-a6a1-d2217845ce4a-0_20240402123035233.log.1_0-8-0"
25+
)
26+
2427

2528
def test_file_group_api_read_file_slice(get_sample_table):
2629
table_path = get_sample_table
@@ -45,19 +48,22 @@ def test_file_group_api_read_file_slice_from_paths(get_sample_table):
4548

4649
batch = file_group_reader.read_file_slice_from_paths(TEST_SAMPLE_BASE_FILE, [])
4750
assert batch.num_rows > 0
48-
49-
batch_original = file_group_reader.read_file_slice_by_base_file_path(TEST_SAMPLE_BASE_FILE)
51+
52+
batch_original = file_group_reader.read_file_slice_by_base_file_path(
53+
TEST_SAMPLE_BASE_FILE
54+
)
5055
assert batch.num_rows == batch_original.num_rows
5156
assert batch.num_columns == batch_original.num_columns
52-
57+
5358
t_new = pa.Table.from_batches([batch]).select([0, 5, 6, 9]).sort_by("ts")
54-
t_original = pa.Table.from_batches([batch_original]).select([0, 5, 6, 9]).sort_by("ts")
59+
t_original = (
60+
pa.Table.from_batches([batch_original]).select([0, 5, 6, 9]).sort_by("ts")
61+
)
5562
assert t_new.to_pylist() == t_original.to_pylist()
56-
63+
5764
try:
5865
batch_with_logs = file_group_reader.read_file_slice_from_paths(
59-
TEST_SAMPLE_BASE_FILE,
60-
[TEST_SAMPLE_LOG_FILE]
66+
TEST_SAMPLE_BASE_FILE, [TEST_SAMPLE_LOG_FILE]
6167
)
6268
assert batch_with_logs.num_rows >= 0
6369
except Exception:

0 commit comments

Comments
 (0)