diff --git a/Cargo.lock b/Cargo.lock index 19f7df8adb2c0..a4dd71c594ce5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5163,6 +5163,7 @@ dependencies = [ "serde_json", "serde_stacker", "serde_urlencoded", + "serde_yaml", "sha2", "socket2", "sqlx", diff --git a/Cargo.toml b/Cargo.toml index c922a010b1424..a48f578dbba48 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -457,6 +457,7 @@ serde_stacker = { version = "0.1" } serde_test = "1.0" serde_urlencoded = "0.7.1" serde_with = { version = "3.8.1" } +serde_yaml = { version = "0.9.34" } serfig = "0.1.0" sha1 = "0.10.5" sha2 = "0.10.8" diff --git a/src/query/catalog/src/lib.rs b/src/query/catalog/src/lib.rs index 30ea1b10ef841..b9ea1b2d1edc8 100644 --- a/src/query/catalog/src/lib.rs +++ b/src/query/catalog/src/lib.rs @@ -31,3 +31,6 @@ pub mod table_args; pub mod table_context; pub mod table_function; pub mod table_with_options; + +pub use statistics::BasicColumnStatistics; +pub use table::TableStatistics; diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index b93c2e7722a48..c8697928166bc 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -190,6 +190,8 @@ mysql_async = { workspace = true } p256 = { workspace = true } pretty_assertions = { workspace = true } reqwest = { workspace = true } +serde_json.workspace = true +serde_yaml = { workspace = true } temp-env = { workspace = true } tempfile = { workspace = true } tower = { workspace = true } diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/README.md b/src/query/service/tests/it/sql/planner/optimizer/data/README.md new file mode 100644 index 0000000000000..e9996f87c3c29 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/README.md @@ -0,0 +1,67 @@ +# TPC-DS Optimizer Test Data + +This directory contains test data for TPC-DS optimizer tests. The tests are structured as follows: + +## Directory Structure + +``` +data +├── tables/ # SQL table definitions +└── yaml/ # YAML test case definitions +``` + +## YAML Test Case Format + +Each test case is defined in a YAML file with the following structure: + +```yaml +name: "Q3" # Test case name +description: "Test description" # Optional description + +sql: | # SQL query to test + SELECT ... + +table_statistics: # Table statistics + table_name: + num_rows: 1000 + data_size: 102400 + data_size_compressed: 51200 + index_size: 20480 + number_of_blocks: 10 + number_of_segments: 2 + +column_statistics: # Column statistics + table_name.column_name: + min: 1990 # Min value (can be number or string) + max: 2000 # Max value (can be number or string) + ndv: 10 # Number of distinct values + null_count: 0 # Number of null values + +raw_plan: | # Expected raw plan + ... + +optimized_plan: | # Expected optimized plan + ... + +snow_plan: | # Optional expected snowflake plan + ... +``` + +## Table Definitions + +Table definitions are stored in SQL files in the `tables` directory. Each file contains a `CREATE TABLE` statement for a specific table used in the tests. + +## Adding New Tests + +To add a new test case: + +1. Create a new YAML file in the `yaml` directory with the test case definition. +2. If the test uses new tables, add the table definitions to the `tables` directory. +3. The test runner will automatically discover and run all test cases in the `yaml` directory. + +## Updating Existing Tests + +If the expected output of a test changes (e.g., due to optimizer improvements): + +1. Run the test to see the actual output. +2. 
Update the `raw_plan`, `optimized_plan`, or `snow_plan` field in the YAML file to match the actual output. diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/call_center.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/call_center.sql new file mode 100644 index 0000000000000..62829b780e0e1 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/call_center.sql @@ -0,0 +1,34 @@ +CREATE OR REPLACE TABLE call_center +( + cc_call_center_sk integer , + cc_call_center_id char(16) , + cc_rec_start_date date null, + cc_rec_end_date date null, + cc_closed_date_sk integer null, + cc_open_date_sk integer null, + cc_name varchar(50) null, + cc_class varchar(50) null, + cc_employees integer null, + cc_sq_ft integer null, + cc_hours char(20) null, + cc_manager varchar(40) null, + cc_mkt_id integer null, + cc_mkt_class char(50) null, + cc_mkt_desc varchar(100) null, + cc_market_manager varchar(40) null, + cc_division integer null, + cc_division_name varchar(50) null, + cc_company integer null, + cc_company_name char(50) null, + cc_street_number char(10) null, + cc_street_name varchar(60) null, + cc_street_type char(15) null, + cc_suite_number char(10) null, + cc_city varchar(60) null, + cc_county varchar(30) null, + cc_state char(2) null, + cc_zip char(10) null, + cc_country varchar(20) null, + cc_gmt_offset decimal(5,2) null, + cc_tax_percentage decimal(5,2) null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/catalog_page.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/catalog_page.sql new file mode 100644 index 0000000000000..93ceeb77d48cd --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/catalog_page.sql @@ -0,0 +1,12 @@ +CREATE OR REPLACE TABLE catalog_page +( + cp_catalog_page_sk integer , + cp_catalog_page_id char(16) , + cp_start_date_sk integer null, + cp_end_date_sk integer null, + cp_department varchar(50) null, + cp_catalog_number integer null, + cp_catalog_page_number integer null, + cp_description varchar(100) null, + cp_type varchar(100) null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/catalog_returns.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/catalog_returns.sql new file mode 100644 index 0000000000000..92381b5d0720b --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/catalog_returns.sql @@ -0,0 +1,30 @@ +CREATE OR REPLACE TABLE catalog_returns +( + cr_returned_date_sk integer null, + cr_returned_time_sk integer null, + cr_item_sk integer , + cr_refunded_customer_sk integer null, + cr_refunded_cdemo_sk integer null, + cr_refunded_hdemo_sk integer null, + cr_refunded_addr_sk integer null, + cr_returning_customer_sk integer null, + cr_returning_cdemo_sk integer null, + cr_returning_hdemo_sk integer null, + cr_returning_addr_sk integer null, + cr_call_center_sk integer null, + cr_catalog_page_sk integer null, + cr_ship_mode_sk integer null, + cr_warehouse_sk integer null, + cr_reason_sk integer null, + cr_order_number integer , + cr_return_quantity integer null, + cr_return_amount decimal(7,2) null, + cr_return_tax decimal(7,2) null, + cr_return_amt_inc_tax decimal(7,2) null, + cr_fee decimal(7,2) null, + cr_return_ship_cost decimal(7,2) null, + cr_refunded_cash decimal(7,2) null, + cr_reversed_charge decimal(7,2) null, + cr_store_credit decimal(7,2) null, + cr_net_loss decimal(7,2) null +); diff --git 
a/src/query/service/tests/it/sql/planner/optimizer/data/tables/customer.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/customer.sql new file mode 100644 index 0000000000000..4cfed76a12c23 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/customer.sql @@ -0,0 +1,21 @@ +CREATE OR REPLACE TABLE customer +( + c_customer_sk integer , + c_customer_id char(16) , + c_current_cdemo_sk integer null, + c_current_hdemo_sk integer null, + c_current_addr_sk integer null, + c_first_shipto_date_sk integer null, + c_first_sales_date_sk integer null, + c_salutation char(10) null, + c_first_name char(20) null, + c_last_name char(30) null, + c_preferred_cust_flag char(1) null, + c_birth_day integer null, + c_birth_month integer null, + c_birth_year integer null, + c_birth_country varchar(20) null, + c_login char(13) null, + c_email_address char(50) null, + c_last_review_date_sk integer null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/customer_address.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/customer_address.sql new file mode 100644 index 0000000000000..fe4593096b308 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/customer_address.sql @@ -0,0 +1,16 @@ +CREATE OR REPLACE TABLE customer_address +( + ca_address_sk integer , + ca_address_id char(16) , + ca_street_number char(10) null, + ca_street_name varchar(60) null, + ca_street_type char(15) null, + ca_suite_number char(10) null, + ca_city varchar(60) null, + ca_county varchar(30) null, + ca_state char(2) null, + ca_zip char(10) null, + ca_country varchar(20) null, + ca_gmt_offset decimal(5,2) null, + ca_location_type char(20) null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/customer_demographics.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/customer_demographics.sql new file mode 100644 index 0000000000000..6f4afdbabb034 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/customer_demographics.sql @@ -0,0 +1,12 @@ +CREATE OR REPLACE TABLE customer_demographics +( + cd_demo_sk integer , + cd_gender char(1) null, + cd_marital_status char(1) null, + cd_education_status char(20) null, + cd_purchase_estimate integer null, + cd_credit_rating char(10) null, + cd_dep_count integer null, + cd_dep_employed_count integer null, + cd_dep_college_count integer null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/date_dim.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/date_dim.sql new file mode 100644 index 0000000000000..4f6f74974e949 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/date_dim.sql @@ -0,0 +1,30 @@ +CREATE OR REPLACE TABLE date_dim ( + d_date_sk INTEGER, + d_date_id VARCHAR, + d_date DATE, + d_month_seq INTEGER, + d_week_seq INTEGER, + d_quarter_seq INTEGER, + d_year INTEGER, + d_dow INTEGER, + d_moy INTEGER, + d_dom INTEGER, + d_qoy INTEGER, + d_fy_year INTEGER, + d_fy_quarter_seq INTEGER, + d_fy_week_seq INTEGER, + d_day_name VARCHAR, + d_quarter_name VARCHAR, + d_holiday CHAR(1), + d_weekend CHAR(1), + d_following_holiday CHAR(1), + d_first_dom INTEGER, + d_last_dom INTEGER, + d_same_day_ly INTEGER, + d_same_day_lq INTEGER, + d_current_day CHAR(1), + d_current_week CHAR(1), + d_current_month CHAR(1), + d_current_quarter CHAR(1), + d_current_year CHAR(1) +); diff --git 
a/src/query/service/tests/it/sql/planner/optimizer/data/tables/household_demographics.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/household_demographics.sql new file mode 100644 index 0000000000000..be3e25725fbaf --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/household_demographics.sql @@ -0,0 +1,8 @@ +CREATE OR REPLACE TABLE household_demographics +( + hd_demo_sk integer , + hd_income_band_sk integer null, + hd_buy_potential char(15) null, + hd_dep_count integer null, + hd_vehicle_count integer null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/income_band.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/income_band.sql new file mode 100644 index 0000000000000..48a1cbbbe21af --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/income_band.sql @@ -0,0 +1,6 @@ +CREATE OR REPLACE TABLE income_band +( + ib_income_band_sk integer , + ib_lower_bound integer null, + ib_upper_bound integer null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/inventory.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/inventory.sql new file mode 100644 index 0000000000000..d9a7c2aca1fd3 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/inventory.sql @@ -0,0 +1,7 @@ +CREATE OR REPLACE TABLE inventory +( + inv_date_sk integer , + inv_item_sk integer , + inv_warehouse_sk integer , + inv_quantity_on_hand integer null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/item.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/item.sql new file mode 100644 index 0000000000000..1575cd622689d --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/item.sql @@ -0,0 +1,24 @@ +CREATE OR REPLACE TABLE item ( + i_item_sk INTEGER, + i_item_id VARCHAR, + i_rec_start_date DATE, + i_rec_end_date DATE, + i_item_desc VARCHAR, + i_current_price DECIMAL(7,2), + i_wholesale_cost DECIMAL(7,2), + i_brand_id INTEGER, + i_brand VARCHAR, + i_class_id INTEGER, + i_class VARCHAR, + i_category_id INTEGER, + i_category VARCHAR, + i_manufact_id INTEGER, + i_manufact VARCHAR, + i_size VARCHAR, + i_formulation VARCHAR, + i_color VARCHAR, + i_units VARCHAR, + i_container VARCHAR, + i_manager_id INTEGER, + i_product_name VARCHAR +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/promotion.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/promotion.sql new file mode 100644 index 0000000000000..88a15e7c80ade --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/promotion.sql @@ -0,0 +1,22 @@ +CREATE OR REPLACE TABLE promotion +( + p_promo_sk integer , + p_promo_id char(16) , + p_start_date_sk integer null, + p_end_date_sk integer null, + p_item_sk integer null, + p_cost decimal(15,2) null, + p_response_target integer null, + p_promo_name char(50) null, + p_channel_dmail char(1) null, + p_channel_email char(1) null, + p_channel_catalog char(1) null, + p_channel_tv char(1) null, + p_channel_radio char(1) null, + p_channel_press char(1) null, + p_channel_event char(1) null, + p_channel_demo char(1) null, + p_channel_details varchar(100) null, + p_purpose char(15) null, + p_discount_active char(1) null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/reason.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/reason.sql new file mode 100644 index 
0000000000000..8b0aaecdeeb47 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/reason.sql @@ -0,0 +1,6 @@ +CREATE OR REPLACE TABLE reason +( + r_reason_sk integer , + r_reason_id char(16) , + r_reason_desc char(100) null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/ship_mode.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/ship_mode.sql new file mode 100644 index 0000000000000..d10c8f47eeaff --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/ship_mode.sql @@ -0,0 +1,9 @@ +CREATE OR REPLACE TABLE ship_mode +( + sm_ship_mode_sk integer , + sm_ship_mode_id char(16) , + sm_type char(30) null, + sm_code char(10) null, + sm_carrier char(20) null, + sm_contract char(20) null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/store.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/store.sql new file mode 100644 index 0000000000000..e7023ba0ba48b --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/store.sql @@ -0,0 +1,32 @@ +CREATE OR REPLACE TABLE store +( + s_store_sk integer , + s_store_id char(16) , + s_rec_start_date date null, + s_rec_end_date date null, + s_closed_date_sk integer null, + s_store_name varchar(50) null, + s_number_employees integer null, + s_floor_space integer null, + s_hours char(20) null, + s_manager varchar(40) null, + s_market_id integer null, + s_geography_class varchar(100) null, + s_market_desc varchar(100) null, + s_market_manager varchar(40) null, + s_division_id integer null, + s_division_name varchar(50) null, + s_company_id integer null, + s_company_name varchar(50) null, + s_street_number varchar(10) null, + s_street_name varchar(60) null, + s_street_type char(15) null, + s_suite_number char(10) null, + s_city varchar(60) null, + s_county varchar(30) null, + s_state char(2) null, + s_zip char(10) null, + s_country varchar(20) null, + s_gmt_offset decimal(5,2) null, + s_tax_precentage decimal(5,2) null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/store_returns.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/store_returns.sql new file mode 100644 index 0000000000000..f9ed79b39bd9e --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/store_returns.sql @@ -0,0 +1,23 @@ +CREATE OR REPLACE TABLE store_returns +( + sr_returned_date_sk integer null, + sr_return_time_sk integer null, + sr_item_sk integer , + sr_customer_sk integer null, + sr_cdemo_sk integer null, + sr_hdemo_sk integer null, + sr_addr_sk integer null, + sr_store_sk integer null, + sr_reason_sk integer null, + sr_ticket_number integer , + sr_return_quantity integer null, + sr_return_amt decimal(7,2) null, + sr_return_tax decimal(7,2) null, + sr_return_amt_inc_tax decimal(7,2) null, + sr_fee decimal(7,2) null, + sr_return_ship_cost decimal(7,2) null, + sr_refunded_cash decimal(7,2) null, + sr_reversed_charge decimal(7,2) null, + sr_store_credit decimal(7,2) null, + sr_net_loss decimal(7,2) null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/store_sales.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/store_sales.sql new file mode 100644 index 0000000000000..2ba5816d7854e --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/store_sales.sql @@ -0,0 +1,25 @@ +CREATE OR REPLACE TABLE store_sales ( + ss_sold_date_sk INTEGER, + ss_sold_time_sk INTEGER, + ss_item_sk INTEGER, + 
ss_customer_sk INTEGER, + ss_cdemo_sk INTEGER, + ss_hdemo_sk INTEGER, + ss_addr_sk INTEGER, + ss_store_sk INTEGER, + ss_promo_sk INTEGER, + ss_ticket_number INTEGER, + ss_quantity INTEGER, + ss_wholesale_cost DECIMAL(7,2), + ss_list_price DECIMAL(7,2), + ss_sales_price DECIMAL(7,2), + ss_ext_discount_amt DECIMAL(7,2), + ss_ext_sales_price DECIMAL(7,2), + ss_ext_wholesale_cost DECIMAL(7,2), + ss_ext_list_price DECIMAL(7,2), + ss_ext_tax DECIMAL(7,2), + ss_coupon_amt DECIMAL(7,2), + ss_net_paid DECIMAL(7,2), + ss_net_paid_inc_tax DECIMAL(7,2), + ss_net_profit DECIMAL(7,2) +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/time_dim.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/time_dim.sql new file mode 100644 index 0000000000000..1a49c9dfc8361 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/time_dim.sql @@ -0,0 +1,13 @@ +CREATE OR REPLACE TABLE time_dim +( + t_time_sk integer , + t_time_id char(16) , + t_time integer null, + t_hour integer null, + t_minute integer null, + t_second integer null, + t_am_pm char(2) null, + t_shift char(20) null, + t_sub_shift char(20) null, + t_meal_time char(20) null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/warehouse.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/warehouse.sql new file mode 100644 index 0000000000000..fcf565d05ade7 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/warehouse.sql @@ -0,0 +1,17 @@ +CREATE OR REPLACE TABLE warehouse +( + w_warehouse_sk integer , + w_warehouse_id char(16) , + w_warehouse_name varchar(20) null, + w_warehouse_sq_ft integer null, + w_street_number char(10) null, + w_street_name varchar(60) null, + w_street_type char(15) null, + w_suite_number char(10) null, + w_city varchar(60) null, + w_county varchar(30) null, + w_state char(2) null, + w_zip char(10) null, + w_country varchar(20) null, + w_gmt_offset decimal(5,2) null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/web_page.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/web_page.sql new file mode 100644 index 0000000000000..8919340761ca4 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/web_page.sql @@ -0,0 +1,17 @@ +CREATE OR REPLACE TABLE web_page +( + wp_web_page_sk integer , + wp_web_page_id char(16) , + wp_rec_start_date date null, + wp_rec_end_date date null, + wp_creation_date_sk integer null, + wp_access_date_sk integer null, + wp_autogen_flag char(1) null, + wp_customer_sk integer null, + wp_url varchar(100) null, + wp_type char(50) null, + wp_char_count integer null, + wp_link_count integer null, + wp_image_count integer null, + wp_max_ad_count integer null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/web_returns.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/web_returns.sql new file mode 100644 index 0000000000000..ddb28846de818 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/web_returns.sql @@ -0,0 +1,27 @@ +CREATE OR REPLACE TABLE web_returns +( + wr_returned_date_sk integer null, + wr_returned_time_sk integer null, + wr_item_sk integer , + wr_refunded_customer_sk integer null, + wr_refunded_cdemo_sk integer null, + wr_refunded_hdemo_sk integer null, + wr_refunded_addr_sk integer null, + wr_returning_customer_sk integer null, + wr_returning_cdemo_sk integer null, + wr_returning_hdemo_sk integer null, + wr_returning_addr_sk 
integer null, + wr_web_page_sk integer null, + wr_reason_sk integer null, + wr_order_number integer , + wr_return_quantity integer null, + wr_return_amt decimal(7,2) null, + wr_return_tax decimal(7,2) null, + wr_return_amt_inc_tax decimal(7,2) null, + wr_fee decimal(7,2) null, + wr_return_ship_cost decimal(7,2) null, + wr_refunded_cash decimal(7,2) null, + wr_reversed_charge decimal(7,2) null, + wr_account_credit decimal(7,2) null, + wr_net_loss decimal(7,2) null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/tables/web_site.sql b/src/query/service/tests/it/sql/planner/optimizer/data/tables/web_site.sql new file mode 100644 index 0000000000000..dbd32679eba47 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/tables/web_site.sql @@ -0,0 +1,29 @@ +CREATE OR REPLACE TABLE web_site +( + web_site_sk integer , + web_site_id char(16) , + web_rec_start_date date null, + web_rec_end_date date null, + web_name varchar(50) null, + web_open_date_sk integer null, + web_close_date_sk integer null, + web_class varchar(50) null, + web_manager varchar(40) null, + web_mkt_id integer null, + web_mkt_class varchar(50) null, + web_mkt_desc varchar(100) null, + web_market_manager varchar(40) null, + web_company_id integer null, + web_company_name char(50) null, + web_street_number char(10) null, + web_street_name varchar(60) null, + web_street_type char(15) null, + web_suite_number char(10) null, + web_city varchar(60) null, + web_county varchar(30) null, + web_state char(2) null, + web_zip char(10) null, + web_country varchar(20) null, + web_gmt_offset decimal(5,2) null, + web_tax_percentage decimal(5,2) null +); diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/yaml/q1.yaml b/src/query/service/tests/it/sql/planner/optimizer/data/yaml/q1.yaml new file mode 100644 index 0000000000000..236cfcc478c6b --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/yaml/q1.yaml @@ -0,0 +1,282 @@ +name: "Q1" +description: "TPC-DS Query 1 optimizer test" + +sql: | + WITH customer_total_return + AS (SELECT sr_customer_sk AS ctr_customer_sk, + sr_store_sk AS ctr_store_sk, + Sum(sr_return_amt) AS ctr_total_return + FROM store_returns, + date_dim + WHERE sr_returned_date_sk = d_date_sk + AND d_year = 2001 + GROUP BY sr_customer_sk, + sr_store_sk) + SELECT c_customer_id + FROM customer_total_return ctr1, + store, + customer + WHERE ctr1.ctr_total_return > (SELECT Avg(ctr_total_return) * 1.2 + FROM customer_total_return ctr2 + WHERE ctr1.ctr_store_sk = ctr2.ctr_store_sk) + AND s_store_sk = ctr1.ctr_store_sk + AND s_state = 'TN' + AND ctr1.ctr_customer_sk = c_customer_sk + ORDER BY c_customer_id + LIMIT 100 + +# Table statistics derived from snow_plan's TableScan information +table_statistics: + date_dim: + num_rows: 73049 # Estimated based on typical date dimension cardinality + data_size: 2138624 # Directly from snow_plan: "bytes: 2,138,624" + number_of_segments: 1 # From snow_plan: "partitions: 1/1" + + store_returns: + num_rows: 287000000 # Estimated based on data size and typical row size + data_size: 124763446272 # Directly from snow_plan: "bytes: 124,763,446,272" + number_of_segments: 7070 # From snow_plan: "partitions: 7070/7070" + + store: + num_rows: 1002 # Estimated based on typical store dimension cardinality + data_size: 135680 # Directly from snow_plan: "bytes: 135,680" + number_of_segments: 1 # From snow_plan: "partitions: 1/1" + + customer: + num_rows: 12000000 # Estimated based on typical customer dimension size + data_size: 
2328538624 # Directly from snow_plan: "bytes: 2,328,538,624" + number_of_segments: 261 # From snow_plan: "partitions: 261/261" + +# Column statistics derived from query predicates and typical TPC-DS data distributions +column_statistics: + # Date dimension columns used in the query + date_dim.d_year: + min: 1990 # Typical range for TPC-DS + max: 2010 # Typical range for TPC-DS + ndv: 21 # Based on min/max range (2010-1990+1) + null_count: 0 # Primary dimension columns typically don't have nulls + + date_dim.d_date_sk: + min: 1 # Typical starting value for surrogate key + max: 73049 # Based on table row count + ndv: 73049 # Primary key, so NDV equals row count + null_count: 0 # Primary key cannot be null + + # Store returns columns used in the query + store_returns.sr_returned_date_sk: + min: 1 # Matches date_dim.d_date_sk min + max: 73049 # Matches date_dim.d_date_sk max + ndv: 73049 # Foreign key to date_dim + null_count: 287998 # Inferred from filter in snow_plan: "STORE_RETURNS.SR_RETURNED_DATE_SK IS NOT NULL" + + store_returns.sr_customer_sk: + min: 1 # Typical starting value for surrogate key + max: 12000000 # Matches customer.c_customer_sk max + ndv: 11000000 # Estimated as slightly less than customer table cardinality + null_count: 143500 # Inferred from filter in snow_plan: "STORE_RETURNS.SR_CUSTOMER_SK IS NOT NULL" + + store_returns.sr_store_sk: + min: 1 # Typical starting value for surrogate key + max: 1002 # Matches store.s_store_sk max + ndv: 1002 # Foreign key to store table + null_count: 143500 # Inferred from filter in snow_plan: "STORE_RETURNS.SR_STORE_SK IS NOT NULL" + + store_returns.sr_return_amt: + min: 0.01 # Minimum reasonable return amount + max: 10000.00 # Maximum reasonable return amount + ndv: 100000 # Estimated based on typical distribution + null_count: 0 # Return amount is typically not null + + # Store columns used in the query + store.s_store_sk: + min: 1 # Typical starting value for surrogate key + max: 1002 # Based on estimated row count + ndv: 1002 # Primary key, so NDV equals row count + null_count: 0 # Primary key cannot be null + + store.s_state: + min: "AK" # Alaska (alphabetically first US state) + max: "WY" # Wyoming (alphabetically last US state) + ndv: 50 # Number of US states + null_count: 0 # State is typically not null + + # Customer columns used in the query + customer.c_customer_sk: + min: 1 # Typical starting value for surrogate key + max: 12000000 # Based on estimated row count + ndv: 12000000 # Primary key, so NDV equals row count + null_count: 0 # Primary key cannot be null + + customer.c_customer_id: + min: "AAAAAAAAAAAAAA" # Lexicographically smallest possible customer ID + max: "ZZZZZZZZZZZZZZ" # Lexicographically largest possible customer ID + ndv: 12000000 # Same as c_customer_sk (1:1 relationship) + null_count: 0 # Customer ID is typically not null + +raw_plan: | + Limit + ├── limit: [100] + ├── offset: [0] + └── Sort + ├── sort keys: [default.customer.c_customer_id (#79) ASC] + ├── limit: [NONE] + └── EvalScalar + ├── scalars: [customer.c_customer_id (#79) AS (#79)] + └── Filter + ├── filters: [gt(ctr1.ctr_total_return (#48), SUBQUERY), eq(store.s_store_sk (#49), ctr1.ctr_store_sk (#7)), eq(store.s_state (#73), 'TN'), eq(ctr1.ctr_customer_sk (#3), customer.c_customer_sk (#78))] + └── Join(Cross) + ├── build keys: [] + ├── probe keys: [] + ├── other filters: [] + ├── Join(Cross) + │ ├── build keys: [] + │ ├── probe keys: [] + │ ├── other filters: [] + │ ├── EvalScalar + │ │ ├── scalars: [store_returns.sr_customer_sk (#3) AS (#3), 
store_returns.sr_store_sk (#7) AS (#7), Sum(sr_return_amt) (#48) AS (#48)] + │ │ └── Aggregate(Initial) + │ │ ├── group items: [store_returns.sr_customer_sk (#3), store_returns.sr_store_sk (#7)] + │ │ ├── aggregate functions: [Sum(sr_return_amt) (#48)] + │ │ └── EvalScalar + │ │ ├── scalars: [store_returns.sr_customer_sk (#3) AS (#3), store_returns.sr_store_sk (#7) AS (#7), store_returns.sr_return_amt (#11) AS (#11)] + │ │ └── Filter + │ │ ├── filters: [eq(store_returns.sr_returned_date_sk (#0), date_dim.d_date_sk (#20)), eq(date_dim.d_year (#26), 2001)] + │ │ └── Join(Cross) + │ │ ├── build keys: [] + │ │ ├── probe keys: [] + │ │ ├── other filters: [] + │ │ ├── Scan + │ │ │ ├── table: default.store_returns + │ │ │ ├── filters: [] + │ │ │ ├── order by: [] + │ │ │ └── limit: NONE + │ │ └── Scan + │ │ ├── table: default.date_dim + │ │ ├── filters: [] + │ │ ├── order by: [] + │ │ └── limit: NONE + │ └── Scan + │ ├── table: default.store + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Scan + ├── table: default.customer + ├── filters: [] + ├── order by: [] + └── limit: NONE + +optimized_plan: | + Limit + ├── limit: [100] + ├── offset: [0] + └── Sort + ├── sort keys: [default.customer.c_customer_id (#79) ASC] + ├── limit: [100] + └── EvalScalar + ├── scalars: [customer.c_customer_id (#79) AS (#79), ctr1.ctr_total_return (#48) AS (#154), scalar_subquery_147 (#147) AS (#155), store.s_store_sk (#49) AS (#156), ctr1.ctr_store_sk (#7) AS (#157), store.s_state (#73) AS (#158), ctr1.ctr_customer_sk (#3) AS (#159), customer.c_customer_sk (#78) AS (#160)] + └── Join(Inner) + ├── build keys: [sr_store_sk (#103)] + ├── probe keys: [sr_store_sk (#7)] + ├── other filters: [gt(ctr1.ctr_total_return (#48), scalar_subquery_147 (#147))] + ├── Join(Inner) + │ ├── build keys: [customer.c_customer_sk (#78)] + │ ├── probe keys: [ctr1.ctr_customer_sk (#3)] + │ ├── other filters: [] + │ ├── Aggregate(Final) + │ │ ├── group items: [store_returns.sr_customer_sk (#3), store_returns.sr_store_sk (#7)] + │ │ ├── aggregate functions: [Sum(sr_return_amt) (#48)] + │ │ └── Aggregate(Partial) + │ │ ├── group items: [store_returns.sr_customer_sk (#3), store_returns.sr_store_sk (#7)] + │ │ ├── aggregate functions: [Sum(sr_return_amt) (#48)] + │ │ └── EvalScalar + │ │ ├── scalars: [store_returns.sr_customer_sk (#3) AS (#3), store_returns.sr_store_sk (#7) AS (#7), store_returns.sr_return_amt (#11) AS (#11), store_returns.sr_returned_date_sk (#0) AS (#148), date_dim.d_date_sk (#20) AS (#149), date_dim.d_year (#26) AS (#150)] + │ │ └── Join(Inner) + │ │ ├── build keys: [date_dim.d_date_sk (#20)] + │ │ ├── probe keys: [store_returns.sr_returned_date_sk (#0)] + │ │ ├── other filters: [] + │ │ ├── Scan + │ │ │ ├── table: default.store_returns + │ │ │ ├── filters: [] + │ │ │ ├── order by: [] + │ │ │ └── limit: NONE + │ │ └── Scan + │ │ ├── table: default.date_dim + │ │ ├── filters: [eq(date_dim.d_year (#26), 2001)] + │ │ ├── order by: [] + │ │ └── limit: NONE + │ └── Scan + │ ├── table: default.customer + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Join(Inner) + ├── build keys: [sr_store_sk (#103)] + ├── probe keys: [store.s_store_sk (#49)] + ├── other filters: [] + ├── Scan + │ ├── table: default.store + │ ├── filters: [eq(store.s_state (#73), 'TN')] + │ ├── order by: [] + │ └── limit: NONE + └── EvalScalar + ├── scalars: [sr_store_sk (#103) AS (#103), multiply(divide(sum(ctr_total_return) (#145), if(eq(count(ctr_total_return) (#146), 0), 1, count(ctr_total_return) (#146))), 1.2) AS (#147)] + └── 
Aggregate(Final) + ├── group items: [subquery_103 (#103)] + ├── aggregate functions: [sum(ctr_total_return) (#145), count(ctr_total_return) (#146)] + └── Aggregate(Partial) + ├── group items: [subquery_103 (#103)] + ├── aggregate functions: [sum(ctr_total_return) (#145), count(ctr_total_return) (#146)] + └── Aggregate(Final) + ├── group items: [store_returns.sr_customer_sk (#99), store_returns.sr_store_sk (#103)] + ├── aggregate functions: [Sum(sr_return_amt) (#144)] + └── Aggregate(Partial) + ├── group items: [store_returns.sr_customer_sk (#99), store_returns.sr_store_sk (#103)] + ├── aggregate functions: [Sum(sr_return_amt) (#144)] + └── EvalScalar + ├── scalars: [store_returns.sr_customer_sk (#99) AS (#99), store_returns.sr_store_sk (#103) AS (#103), store_returns.sr_return_amt (#107) AS (#107), store_returns.sr_returned_date_sk (#96) AS (#151), date_dim.d_date_sk (#116) AS (#152), date_dim.d_year (#122) AS (#153)] + └── Join(Inner) + ├── build keys: [date_dim.d_date_sk (#116)] + ├── probe keys: [store_returns.sr_returned_date_sk (#96)] + ├── other filters: [] + ├── Scan + │ ├── table: default.store_returns + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Scan + ├── table: default.date_dim + ├── filters: [eq(date_dim.d_year (#122), 2001)] + ├── order by: [] + └── limit: NONE + +# Converted from tabular format to tree format based on parent-child relationships +snow_plan: | + Result + └── SortWithLimit [sortKey: (CUSTOMER.C_CUSTOMER_ID ASC NULLS LAST), rowCount: 100] + └── InnerJoin [joinKey: (CTR1.CTR_CUSTOMER_SK = CUSTOMER.C_CUSTOMER_SK)] + ├── InnerJoin [joinKey: (STORE.S_STORE_SK = CTR1.CTR_STORE_SK)] + │ ├── Filter [STORE.S_STATE = 'TN'] + │ │ └── TableScan [SNOWFLAKE_SAMPLE_DATA.TPCDS_SF10TCL.STORE] [S_STORE_SK, S_STATE] [partitions: 1/1, bytes: 135,680] + │ └── InnerJoin [joinKey: (CTR2.CTR_STORE_SK = CTR1.CTR_STORE_SK), joinFilter: (CTR1.CTR_TOTAL_RETURN) > (((SUM(CTR2.CTR_TOTAL_RETURN)) / (NVL(COUNT(CTR2.CTR_TOTAL_RETURN), 0))) * 1.2)] + │ ├── Filter [(SUM(CTR2.CTR_TOTAL_RETURN) IS NOT NULL) AND (COUNT(CTR2.CTR_TOTAL_RETURN) IS NOT NULL)] + │ │ └── Aggregate [aggExprs: [SUM(CTR2.CTR_TOTAL_RETURN), COUNT(CTR2.CTR_TOTAL_RETURN)], groupKeys: [CTR2.CTR_STORE_SK]] + │ │ └── JoinFilter [joinKey: (STORE.S_STORE_SK = CTR1.CTR_STORE_SK)] + │ │ └── WithReference [CTR2] + │ │ └── Filter [STORE_RETURNS.SR_STORE_SK IS NOT NULL] + │ │ └── WithClause [CUSTOMER_TOTAL_RETURN] + │ │ └── Aggregate [aggExprs: [SUM(SUM(SUM(STORE_RETURNS.SR_RETURN_AMT)))], groupKeys: [STORE_RETURNS.SR_CUSTOMER_SK, STORE_RETURNS.SR_STORE_SK]] + │ │ └── Aggregate [aggExprs: [SUM(SUM(STORE_RETURNS.SR_RETURN_AMT))], groupKeys: [STORE_RETURNS.SR_CUSTOMER_SK, STORE_RETURNS.SR_STORE_SK]] + │ │ └── InnerJoin [joinKey: (DATE_DIM.D_DATE_SK = STORE_RETURNS.SR_RETURNED_DATE_SK)] + │ │ ├── Filter [DATE_DIM.D_YEAR = 2001] + │ │ │ └── TableScan [SNOWFLAKE_SAMPLE_DATA.TPCDS_SF10TCL.DATE_DIM] [D_DATE_SK, D_YEAR] [partitions: 1/1, bytes: 2,138,624] + │ │ └── Aggregate [aggExprs: [SUM(STORE_RETURNS.SR_RETURN_AMT)], groupKeys: [STORE_RETURNS.SR_CUSTOMER_SK, STORE_RETURNS.SR_STORE_SK, STORE_RETURNS.SR_RETURNED_DATE_SK]] + │ │ └── Filter [STORE_RETURNS.SR_RETURNED_DATE_SK IS NOT NULL] + │ │ └── JoinFilter [joinKey: (DATE_DIM.D_DATE_SK = STORE_RETURNS.SR_RETURNED_DATE_SK)] + │ │ └── TableScan [SNOWFLAKE_SAMPLE_DATA.TPCDS_SF10TCL.STORE_RETURNS] [SR_RETURNED_DATE_SK, SR_CUSTOMER_SK, SR_STORE_SK, SR_RETURN_AMT] [partitions: 7070/7070, bytes: 124,763,446,272] + │ └── JoinFilter [joinKey: (STORE.S_STORE_SK = 
CTR1.CTR_STORE_SK)] + │ └── WithReference [CTR1] + │ └── Filter [(STORE_RETURNS.SR_STORE_SK IS NOT NULL) AND (STORE_RETURNS.SR_CUSTOMER_SK IS NOT NULL)] + │ └── WithClause [CUSTOMER_TOTAL_RETURN] (reference to earlier WITH clause) + └── JoinFilter [joinKey: (CTR1.CTR_CUSTOMER_SK = CUSTOMER.C_CUSTOMER_SK)] + └── TableScan [SNOWFLAKE_SAMPLE_DATA.TPCDS_SF10TCL.CUSTOMER] [C_CUSTOMER_SK, C_CUSTOMER_ID] [partitions: 261/261, bytes: 2,328,538,624] diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/yaml/q3.yaml b/src/query/service/tests/it/sql/planner/optimizer/data/yaml/q3.yaml new file mode 100644 index 0000000000000..25569122a3650 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/data/yaml/q3.yaml @@ -0,0 +1,190 @@ +name: "Q3" +description: "TPC-DS Query 3 optimizer test" + +sql: | + SELECT dt.d_year, item.i_brand_id brand_id, item.i_brand brand, + SUM(ss_ext_sales_price) AS sum_agg + FROM date_dim dt, store_sales, item + WHERE dt.d_date_sk = store_sales.ss_sold_date_sk + AND store_sales.ss_item_sk = item.i_item_sk + AND item.i_manufact_id = 128 + AND dt.d_moy = 11 + GROUP BY dt.d_year, item.i_brand, item.i_brand_id + ORDER BY dt.d_year, sum_agg DESC, brand_id + LIMIT 100 + +table_statistics: + date_dim: + num_rows: 73049 # Estimated based on typical date dimension cardinality + data_size: 2138624 # From snow_plan: "TableScan (DATE_DIM as DT) [partitions: 1/1, bytes: 2,138,624]" + data_size_compressed: 1069312 # Estimated as 50% of data_size + index_size: 427724 # Estimated as 20% of data_size + number_of_blocks: 21 # Estimated based on data_size + number_of_segments: 1 # From snow_plan: "partitions: 1/1" + store_sales: + num_rows: 2879987999 # Estimated based on data size and typical row size + data_size: 1212628258304 # From snow_plan: "TableScan (STORE_SALES) [partitions: 70,412/72,718, bytes: 1,212,628,258,304]" + data_size_compressed: 606314129152 # Estimated as 50% of data_size + index_size: 242525651660 # Estimated as 20% of data_size + number_of_blocks: 12126282 # Estimated based on data_size + number_of_segments: 70412 # From snow_plan: "partitions: 70,412/72,718" + item: + num_rows: 462000 # Estimated based on ss_item_sk range and typical item dimension size + data_size: 23811584 # From snow_plan: "TableScan (ITEM) [partitions: 2/2, bytes: 23,811,584]" + data_size_compressed: 11905792 # Estimated as 50% of data_size + index_size: 4762316 # Estimated as 20% of data_size + number_of_blocks: 238 # Estimated based on data_size + number_of_segments: 2 # From snow_plan: "partitions: 2/2" + +column_statistics: + date_dim.d_year: + min: 1990 # Typical range for TPC-DS + max: 2000 # Typical range for TPC-DS + ndv: 11 # Based on min/max range + null_count: 0 # Primary dimension columns typically don't have nulls + date_dim.d_date_sk: + min: 1 # Typical starting value for surrogate key + max: 73049 # Based on table row count + ndv: 73049 # Primary key, so NDV equals row count + null_count: 0 # Primary key cannot be null + date_dim.d_moy: + min: 1 # January + max: 12 # December + ndv: 12 # 12 months in a year + null_count: 0 # Date parts typically don't have nulls + store_sales.ss_ext_sales_price: + min: 0.01 # Minimum reasonable sales price + max: 30000.00 # Maximum reasonable extended sales price + ndv: 573997 # Estimated as ~20% of row count + null_count: 0 # Sales amount is typically not null + store_sales.ss_sold_date_sk: + min: 1 # Matches date_dim.d_date_sk min + max: 73049 # Matches date_dim.d_date_sk max + ndv: 73049 # Foreign key to date_dim + 
null_count: 287998 # From snow_plan filter: "STORE_SALES.SS_SOLD_DATE_SK IS NOT NULL" implies some nulls exist + store_sales.ss_item_sk: + min: 1 # Typical starting value for surrogate key + max: 462000 # Matches item.i_item_sk max + ndv: 462000 # Foreign key to item table + null_count: 0 # Required join key is typically not null + item.i_brand_id: + min: 1 # Typical starting value for ID + max: 1000 # Typical range for TPC-DS + ndv: 948 # Estimated based on TPC-DS typical cardinality + null_count: 0 # Brand ID is typically not null + item.i_brand: + min: "AAAAAAAAAAAAAA" # Lexicographically smallest possible brand name + max: "zzzzzzzzzzzzzz" # Lexicographically largest possible brand name + ndv: 948 # Same as i_brand_id (1:1 relationship) + null_count: 0 # Brand name is typically not null + item.i_item_sk: + min: 1 # Typical starting value for surrogate key + max: 462000 # Based on estimated row count + ndv: 462000 # Primary key, so NDV equals row count + null_count: 0 # Primary key cannot be null + item.i_manufact_id: + min: 1 # Typical starting value for ID + max: 1000 # Typical range for TPC-DS + ndv: 1000 # Based on typical TPC-DS cardinality + null_count: 0 # Manufacturer ID is typically not null + +raw_plan: | + Limit + ├── limit: [100] + ├── offset: [0] + └── Sort + ├── sort keys: [default.date_dim.d_year (#6) ASC, derived.SUM(ss_ext_sales_price) (#73) DESC, default.item.i_brand_id (#58) ASC] + ├── limit: [NONE] + └── EvalScalar + ├── scalars: [dt.d_year (#6) AS (#6), item.i_brand_id (#58) AS (#58), item.i_brand (#59) AS (#59), SUM(ss_ext_sales_price) (#73) AS (#73)] + └── Aggregate(Initial) + ├── group items: [dt.d_year (#6), item.i_brand (#59), item.i_brand_id (#58)] + ├── aggregate functions: [SUM(ss_ext_sales_price) (#73)] + └── EvalScalar + ├── scalars: [dt.d_year (#6) AS (#6), store_sales.ss_ext_sales_price (#43) AS (#43), item.i_brand_id (#58) AS (#58), item.i_brand (#59) AS (#59)] + └── Filter + ├── filters: [eq(dt.d_date_sk (#0), store_sales.ss_sold_date_sk (#28)), eq(store_sales.ss_item_sk (#30), item.i_item_sk (#51)), eq(item.i_manufact_id (#64), 128), eq(dt.d_moy (#8), 11)] + └── Join(Cross) + ├── build keys: [] + ├── probe keys: [] + ├── other filters: [] + ├── Join(Cross) + │ ├── build keys: [] + │ ├── probe keys: [] + │ ├── other filters: [] + │ ├── Scan + │ │ ├── table: default.date_dim + │ │ ├── filters: [] + │ │ ├── order by: [] + │ │ └── limit: NONE + │ └── Scan + │ ├── table: default.store_sales + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Scan + ├── table: default.item + ├── filters: [] + ├── order by: [] + └── limit: NONE + +optimized_plan: | + Limit + ├── limit: [100] + ├── offset: [0] + └── Sort + ├── sort keys: [default.date_dim.d_year (#6) ASC, derived.SUM(ss_ext_sales_price) (#73) DESC, default.item.i_brand_id (#58) ASC] + ├── limit: [100] + └── Aggregate(Final) + ├── group items: [dt.d_year (#6), item.i_brand (#59), item.i_brand_id (#58)] + ├── aggregate functions: [SUM(ss_ext_sales_price) (#73)] + └── Aggregate(Partial) + ├── group items: [dt.d_year (#6), item.i_brand (#59), item.i_brand_id (#58)] + ├── aggregate functions: [SUM(ss_ext_sales_price) (#73)] + └── EvalScalar + ├── scalars: [dt.d_year (#6) AS (#6), store_sales.ss_ext_sales_price (#43) AS (#43), item.i_brand_id (#58) AS (#58), item.i_brand (#59) AS (#59), dt.d_date_sk (#0) AS (#74), store_sales.ss_sold_date_sk (#28) AS (#75), store_sales.ss_item_sk (#30) AS (#76), item.i_item_sk (#51) AS (#77), item.i_manufact_id (#64) AS (#78), dt.d_moy (#8) AS (#79)] + └── 
Join(Inner) + ├── build keys: [store_sales.ss_sold_date_sk (#28)] + ├── probe keys: [dt.d_date_sk (#0)] + ├── other filters: [] + ├── Scan + │ ├── table: default.date_dim + │ ├── filters: [eq(date_dim.d_moy (#8), 11)] + │ ├── order by: [] + │ └── limit: NONE + └── Join(Inner) + ├── build keys: [item.i_item_sk (#51)] + ├── probe keys: [store_sales.ss_item_sk (#30)] + ├── other filters: [] + ├── Scan + │ ├── table: default.store_sales + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Scan + ├── table: default.item + ├── filters: [eq(item.i_manufact_id (#64), 128)] + ├── order by: [] + └── limit: NONE + +snow_plan: | + Result [output: DT.D_YEAR, ITEM.I_BRAND_ID, ITEM.I_BRAND, SUM(...)] + └── SortWithLimit [limit: 100] + ├── sort keys: [DT.D_YEAR ASC NULLS LAST, SUM(SS_EXT_SALES_PRICE) DESC NULLS FIRST, ITEM.I_BRAND_ID ASC NULLS LAST] + └── Aggregate [group by: DT.D_YEAR, ITEM.I_BRAND, ITEM.I_BRAND_ID] + └── Aggregate [group by: DT.D_YEAR, ITEM.I_BRAND, ITEM.I_BRAND_ID] + └── InnerJoin [join key: (DT.D_DATE_SK = STORE_SALES.SS_SOLD_DATE_SK)] + ├── Filter [condition: DT.D_MOY = 11] + │ └── TableScan (DATE_DIM as DT) [partitions: 1/1, bytes: 2,138,624] + │ └── columns: [D_DATE_SK, D_YEAR, D_MOY] + └── Aggregate [group by: ITEM.I_BRAND_ID, ITEM.I_BRAND, STORE_SALES.SS_SOLD_DATE_SK] + └── InnerJoin [join key: (ITEM.I_ITEM_SK = STORE_SALES.SS_ITEM_SK)] + ├── Aggregate [group by: ITEM.I_ITEM_SK, ITEM.I_BRAND_ID, ITEM.I_BRAND] + │ └── Filter [condition: ITEM.I_MANUFACT_ID = 128] + │ └── TableScan (ITEM) [partitions: 2/2, bytes: 23,811,584] + │ └── columns: [I_ITEM_SK, I_BRAND_ID, I_BRAND, I_MANUFACT_ID] + └── Aggregate [group by: STORE_SALES.SS_SOLD_DATE_SK, STORE_SALES.SS_ITEM_SK] + └── Filter [condition: STORE_SALES.SS_SOLD_DATE_SK IS NOT NULL] + └── JoinFilter [join key: (DT.D_DATE_SK = STORE_SALES.SS_SOLD_DATE_SK)] + └── TableScan (STORE_SALES) [partitions: 70,412/72,718, bytes: 1,212,628,258,304] + └── columns: [SS_SOLD_DATE_SK, SS_ITEM_SK, SS_EXT_SALES_PRICE] diff --git a/src/query/service/tests/it/sql/planner/optimizer/mod.rs b/src/query/service/tests/it/sql/planner/optimizer/mod.rs index f21048ed2e1d4..3dd9c1155e871 100644 --- a/src/query/service/tests/it/sql/planner/optimizer/mod.rs +++ b/src/query/service/tests/it/sql/planner/optimizer/mod.rs @@ -13,3 +13,5 @@ // limitations under the License. mod agg_index_query_rewrite; +pub mod test_utils; +mod tpcds_test; diff --git a/src/query/service/tests/it/sql/planner/optimizer/test_utils.rs b/src/query/service/tests/it/sql/planner/optimizer/test_utils.rs new file mode 100644 index 0000000000000..b5fd8d356da33 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/test_utils.rs @@ -0,0 +1,260 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
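+//! Test utilities for the TPC-DS optimizer tests: statistics injection and plan-building helpers.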
+ +use std::collections::HashMap; +use std::sync::Arc; + +use databend_common_catalog::catalog::CatalogManager; +use databend_common_catalog::BasicColumnStatistics; +use databend_common_catalog::TableStatistics; +use databend_common_exception::Result; +use databend_common_sql::optimizer::OptimizerContext; +use databend_common_sql::optimizer::SExpr; +use databend_common_sql::planner::optimize; +use databend_common_sql::planner::Binder; +use databend_common_sql::planner::Metadata; +use databend_common_sql::plans::Plan; +use databend_common_sql::plans::RelOperator; +use databend_common_sql::plans::Statistics; +use databend_common_sql::BaseTableColumn; +use databend_common_sql::ColumnEntry; +use databend_common_sql::IndexType; +use databend_common_sql::MetadataRef; +use databend_common_sql::NameResolutionContext; +use databend_common_sql::Planner; +use databend_query::interpreters::InterpreterFactory; +use databend_query::sessions::QueryContext; + +/// A builder for setting test statistics on plans +#[derive(Clone, Debug, Default)] +pub struct TestStatisticsBuilder { + // Map of table name to table statistics + table_stats: HashMap<String, Option<TableStatistics>>, + // Map of (table name, column name) to column statistics + column_stats: HashMap<(String, String), Option<BasicColumnStatistics>>, +} + +impl TestStatisticsBuilder { + /// Create a new TestStatisticsBuilder + pub fn new() -> Self { + Self { + table_stats: HashMap::new(), + column_stats: HashMap::new(), + } + } + + /// Set statistics for a specific table + pub fn set_table_statistics( + &mut self, + table_name: &str, + stats: Option<TableStatistics>, + ) -> &mut Self { + self.table_stats.insert(table_name.to_string(), stats); + self + } + + /// Set statistics for a specific column in a table + pub fn set_column_statistics( + &mut self, + table_name: &str, + column_name: &str, + stats: Option<BasicColumnStatistics>, + ) -> &mut Self { + self.column_stats + .insert((table_name.to_string(), column_name.to_string()), stats); + self + } + + /// Apply the statistics to a plan + pub fn apply_to_plan(&self, plan: &mut Plan) -> Result<()> { + if let Plan::Query { + s_expr, metadata, .. + } = plan + { + let new_s_expr = self.apply_to_sexpr(s_expr, metadata)?; + *s_expr = Box::new(new_s_expr); + } + Ok(()) + } + + /// Internal helper to apply statistics to an SExpr + fn apply_to_sexpr(&self, s_expr: &SExpr, metadata: &MetadataRef) -> Result<SExpr> { + let mut result = s_expr.clone(); + + if let RelOperator::Scan(scan) = s_expr.plan.as_ref() { + let table_index = scan.table_index; + let metadata_guard = metadata.read(); + let table = metadata_guard.table(table_index); + let table_name = table.name(); + + // Check if we have statistics for this table + let should_update = self.table_stats.contains_key(table_name) + || self.column_stats.iter().any(|((t, _), _)| t == table_name); + + if should_update { + let mut new_scan = scan.clone(); + + // Create a mutable copy of the statistics + let mut stats = Statistics { + table_stats: None, + column_stats: HashMap::new(), + histograms: HashMap::new(), + }; + + // Update table statistics if specified + if let Some(table_stats) = self.table_stats.get(table_name) { + stats.table_stats = *table_stats; + } + + // Update column statistics if specified + let metadata_guard = metadata.read(); + let columns = metadata_guard.columns_by_table_index(table_index); + for (idx, column) in columns.iter().enumerate() { + if let ColumnEntry::BaseTableColumn(BaseTableColumn { column_name, .. }) = + column + { + if let Some(col_stats_option) = self + .column_stats + .get(&(table_name.to_string(), column_name.clone())) + { + stats + .column_stats + .insert(idx as IndexType, col_stats_option.clone()); + } + } + } + + // Replace the statistics + new_scan.statistics = Arc::new(stats); + result = result.replace_plan(Arc::new(RelOperator::Scan(new_scan))); + } + } + + // Recursively process children + let mut new_children = Vec::new(); + for child in s_expr.children() { + new_children.push(Arc::new(self.apply_to_sexpr(child, metadata)?)); + } + + if !new_children.is_empty() { + result = result.replace_children(new_children); + } + + Ok(result) + } +} + +// Extension trait for Plan to make it easier to set statistics +pub trait PlanStatisticsExt { + /// Set statistics for a specific table + fn set_table_statistics( + &mut self, + table_name: &str, + stats: Option<TableStatistics>, + ) -> Result<&mut Self>; + + /// Set statistics for a specific column in a table + fn set_column_statistics( + &mut self, + table_name: &str, + column_name: &str, + stats: Option<BasicColumnStatistics>, + ) -> Result<&mut Self>; +} + +impl PlanStatisticsExt for Plan { + fn set_table_statistics( + &mut self, + table_name: &str, + stats: Option<TableStatistics>, + ) -> Result<&mut Self> { + let mut builder = TestStatisticsBuilder::new(); + builder.set_table_statistics(table_name, stats); + builder.apply_to_plan(self)?; + Ok(self) + } + + fn set_column_statistics( + &mut self, + table_name: &str, + column_name: &str, + stats: Option<BasicColumnStatistics>, + ) -> Result<&mut Self> { + let mut builder = TestStatisticsBuilder::new(); + builder.set_column_statistics(table_name, column_name, stats); + builder.apply_to_plan(self)?; + Ok(self) + } +} + +// TPC-DS Test Utilities + +/// Plan a SQL query to get a Plan object +pub async fn plan_sql(ctx: &Arc<QueryContext>, sql: &str) -> Result<Plan> { + let mut planner = Planner::new(ctx.clone()); + let (plan, _) = planner.plan_sql(sql).await?; + + Ok(plan) +} + +/// Execute a SQL statement +pub async fn execute_sql(ctx: &Arc<QueryContext>, sql: &str) -> Result<()> { + let plan = plan_sql(ctx, sql).await?; + let it = InterpreterFactory::get(ctx.clone(), &plan).await?; + let _ = it.execute(ctx.clone()).await?; + Ok(()) +} + +/// Get the raw plan from SQL +pub async fn raw_plan(ctx: &Arc<QueryContext>, sql: &str) -> Result<Plan> { + let planner = Planner::new(ctx.clone()); + let extras = planner.parse_sql(sql)?; + + let metadata = Arc::new(parking_lot::RwLock::new(Metadata::default())); + let name_resolution_ctx = NameResolutionContext::default(); + + let binder = Binder::new( + ctx.clone(), + CatalogManager::instance(), + name_resolution_ctx, + metadata.clone(), + ); + + binder.bind(&extras.statement).await +} + +/// Optimize a plan +pub async fn optimize_plan(ctx: &Arc<QueryContext>, plan: Plan) -> Result<Plan> { + // Extract the metadata from the plan if it's a Query variant + let metadata = match &plan { + Plan::Query { metadata, .. } => metadata.clone(), + _ => { + // If it's not a Query, we still need to provide metadata, but log a warning + eprintln!("Warning: Plan is not a Query variant, creating new metadata"); + Arc::new(parking_lot::RwLock::new(Metadata::default())) + } + }; + + let opt_ctx = OptimizerContext::new(ctx.clone(), metadata); + optimize(opt_ctx, plan).await +} + +/// Test case structure for optimizer tests +pub struct TestCase { + pub name: &'static str, + pub sql: &'static str, + pub stats_setup: fn(&mut Plan) -> Result<()>, + pub raw_plan: &'static str, // Expected raw plan string + pub expected_plan: &'static str, // Expected optimized plan string +} diff --git a/src/query/service/tests/it/sql/planner/optimizer/tpcds_test.rs b/src/query/service/tests/it/sql/planner/optimizer/tpcds_test.rs new file mode 100644 index 0000000000000..eedd04fc76163 --- /dev/null +++ b/src/query/service/tests/it/sql/planner/optimizer/tpcds_test.rs @@ -0,0 +1,311 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::cell::RefCell; +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::sync::Arc; +use std::thread_local; + +use databend_common_catalog::BasicColumnStatistics; +use databend_common_catalog::TableStatistics; +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::types::NumberScalar; +use databend_common_expression::Scalar; +use databend_common_sql::plans::Plan; +use databend_common_storage::Datum; +use databend_query::sessions::QueryContext; +use databend_query::test_kits::TestFixture; +use serde::Deserialize; +use serde::Serialize; + +use crate::sql::planner::optimizer::test_utils::execute_sql; +use crate::sql::planner::optimizer::test_utils::optimize_plan; +use crate::sql::planner::optimizer::test_utils::raw_plan; +use crate::sql::planner::optimizer::test_utils::PlanStatisticsExt; +use crate::sql::planner::optimizer::test_utils::TestCase; + +/// YAML representation of a test case +#[derive(Debug, Serialize, Deserialize)] +struct YamlTestCase { + name: String, + description: Option<String>, + sql: String, + table_statistics: HashMap<String, YamlTableStatistics>, + column_statistics: HashMap<String, YamlColumnStatistics>, + raw_plan: String, + optimized_plan: String, + snow_plan: Option<String>, +} + +/// YAML representation of table statistics +#[derive(Debug, Serialize, Deserialize, Clone)] +struct YamlTableStatistics { + num_rows: Option<u64>, + data_size: Option<u64>, + data_size_compressed: Option<u64>, + index_size: Option<u64>, + number_of_blocks: Option<u64>, + number_of_segments: Option<u64>, +} + +/// YAML representation of column statistics +#[derive(Debug, Serialize, Deserialize, Clone)] +struct YamlColumnStatistics { + min: Option<serde_json::Value>, + max: Option<serde_json::Value>, + ndv: Option<u64>, + null_count: Option<u64>, +} + +type TableStatsMap = HashMap<String, YamlTableStatistics>; +type ColumnStatsMap = HashMap<String, YamlColumnStatistics>; + +// Thread-local storage for test case data +thread_local! { + static TEST_CASE_DATA: RefCell<Option<(TableStatsMap, ColumnStatsMap)>> = const { RefCell::new(None) }; +} + +/// Set up TPC-DS tables with the required schema +async fn setup_tpcds_tables(ctx: &Arc<QueryContext>) -> Result<()> { + // Get the base path for table definitions + let base_path = + Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/it/sql/planner/optimizer/data/tables"); + + // Check if the directory exists + if !base_path.exists() { + return Err(ErrorCode::UnknownException(format!( + "Tables directory not found at {:?}", + base_path + ))); + } + + // Read all SQL files from the tables directory + for entry in fs::read_dir(&base_path)? { + let entry = entry?; + let path = entry.path(); + + // Only process SQL files + if path.is_file() && path.extension().is_some_and(|ext| ext == "sql") { + // Extract table name from filename (without extension) + if let Some(file_stem) = path.file_stem() { + if let Some(table_name) = file_stem.to_str() { + let sql = fs::read_to_string(&path)?; + println!("Creating table: {}", table_name); + execute_sql(ctx, &sql).await?; + } + } + } + } + + Ok(()) +} + +/// Convert a YAML test case to a TestCase +fn create_test_case(yaml: YamlTestCase) -> Result<TestCase> { + // Store statistics data in thread-local storage + TEST_CASE_DATA.with(|data| { + *data.borrow_mut() = Some(( + yaml.table_statistics.clone(), + yaml.column_statistics.clone(), + )); + }); + + // Create a stats setup function that accesses the thread-local data + fn stats_setup(plan: &mut Plan) -> Result<()> { + TEST_CASE_DATA.with(|data_cell| { + if let Some((table_stats, column_stats)) = &*data_cell.borrow() { + // Set table statistics + for (table_name, stats) in table_stats { + plan.set_table_statistics( + table_name, + Some(TableStatistics { + num_rows: stats.num_rows, + data_size: stats.data_size, + data_size_compressed: stats.data_size_compressed, + index_size: stats.index_size, + number_of_blocks: stats.number_of_blocks, + number_of_segments: stats.number_of_segments, + }), + )?; + } + + // Set column statistics + for (key, stats) in column_stats { + let parts: Vec<&str> = key.split('.').collect(); + if parts.len() == 2 { + let (table, column) = (parts[0], parts[1]); + + plan.set_column_statistics( + table, + column, + Some(BasicColumnStatistics { + min: convert_to_datum(&stats.min), + max: convert_to_datum(&stats.max), + ndv: stats.ndv, + null_count: stats.null_count.unwrap_or(0), + }), + )?; + } + } + } + Ok(()) + }) + } + + Ok(TestCase { + name: Box::leak(yaml.name.into_boxed_str()), + sql: Box::leak(yaml.sql.into_boxed_str()), + stats_setup, + raw_plan: Box::leak(yaml.raw_plan.into_boxed_str()), + expected_plan: Box::leak(yaml.optimized_plan.into_boxed_str()), + }) +} + +/// Convert a JSON value to a Datum +fn convert_to_datum(value: &Option<serde_json::Value>) -> Option<Datum> { + if let Some(val) = value { + match val { + serde_json::Value::Number(n) => { + if let Some(i) = n.as_i64() { + // Convert to i64 datum using from_scalar + return Datum::from_scalar(Scalar::Number(NumberScalar::Int64(i))); + } else if let Some(f) = n.as_f64() { + // Convert to f64 datum using from_scalar + return Datum::from_scalar(Scalar::Number(NumberScalar::Float64(f.into()))); + } + } + serde_json::Value::String(s) => { + // Convert to string datum using from_scalar + return Datum::from_scalar(Scalar::String(s.clone())); + } + // Add other type conversions as needed + _ => {} + } + } + None +} + +/// Load test cases from YAML files +fn load_test_cases(base_path: &Path) -> Result<Vec<TestCase>> { + let yaml_dir = base_path.join("yaml"); + let mut test_cases = Vec::new(); + + if !yaml_dir.exists() { +
return Ok(Vec::new()); + } + + for entry in fs::read_dir(yaml_dir)? { + let entry = entry?; + let path = entry.path(); + + if path.is_file() + && path + .extension() + .is_some_and(|ext| ext == "yaml" || ext == "yml") + { + let content = fs::read_to_string(&path)?; + let yaml_test_case: YamlTestCase = serde_yaml::from_str(&content) + .map_err(|e| ErrorCode::Internal(format!("Failed to parse YAML: {}", e)))?; + let test_case = create_test_case(yaml_test_case)?; + test_cases.push(test_case); + } + } + + Ok(test_cases) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn test_tpcds_optimizer() -> Result<()> { + // Create a test fixture with a query context + let fixture = TestFixture::setup().await?; + let ctx = fixture.new_query_ctx().await?; + + // Setup tables needed for TPC-DS queries + setup_tpcds_tables(&ctx).await?; + + // Load test cases from YAML files + let base_path = + Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/it/sql/planner/optimizer/data"); + + let tests = load_test_cases(&base_path)?; + + if tests.is_empty() { + println!("No test cases found in {:?}", base_path); + return Ok(()); + } + + // Run all test cases + for test in tests { + println!("\n\n========== Testing: {} ==========", test.name); + + // Parse SQL to get raw plan + let mut raw_plan = raw_plan(&ctx, test.sql).await?; + + // Set statistics for the plan + (test.stats_setup)(&mut raw_plan)?; + + // Print and verify raw plan + let raw_plan_str = raw_plan.format_indent(false)?; + println!("Raw plan:\n{}", raw_plan_str); + + // Verify raw plan matches expected + let actual_raw = raw_plan_str.trim(); + let expected_raw = test.raw_plan.trim(); + if actual_raw != expected_raw { + println!("Raw plan difference detected for test {}:\n", test.name); + println!("Expected raw plan:\n{}\n", expected_raw); + println!("Actual raw plan:\n{}\n", actual_raw); + // Update the expected output in the test case + println!( + "To fix the test, update the raw_plan in the test case to match the actual output." 
+ ); + } + assert_eq!( + actual_raw, expected_raw, + "Test {} failed: raw plan does not match expected output", + test.name + ); + + // Optimize the plan + let optimized_plan = optimize_plan(&ctx, raw_plan).await?; + let optimized_plan_str = optimized_plan.format_indent(false)?; + println!("Optimized plan:\n{}", optimized_plan_str); + + // Verify the optimized plan matches expected output + let actual_optimized = optimized_plan_str.trim(); + let expected_optimized = test.expected_plan.trim(); + if actual_optimized != expected_optimized { + println!( + "Optimized plan difference detected for test {}:\n", + test.name + ); + println!("Expected optimized plan:\n{}\n", expected_optimized); + println!("Actual optimized plan:\n{}\n", actual_optimized); + // Update the expected output in the test case + println!("To fix the test, update the expected_plan in the test case to match the actual output."); + } + assert_eq!( + actual_optimized, expected_optimized, + "Test {} failed: optimized plan does not match expected output", + test.name + ); + + println!("✅ {} test passed!", test.name); + } + + Ok(()) +} diff --git a/src/query/sql/src/planner/mod.rs b/src/query/sql/src/planner/mod.rs index 73e35305a158b..5ce6f829ecc11 100644 --- a/src/query/sql/src/planner/mod.rs +++ b/src/query/sql/src/planner/mod.rs @@ -42,6 +42,7 @@ pub use bloom_index::BloomIndexColumns; pub use expression_parser::*; pub use format::format_scalar; pub use metadata::*; +pub use optimizer::optimize; pub use planner::get_query_kind; pub use planner::PlanExtras; pub use planner::Planner; diff --git a/src/query/sql/src/planner/optimizer/s_expr.rs b/src/query/sql/src/planner/optimizer/s_expr.rs index 5634844027461..2092155cd4248 100644 --- a/src/query/sql/src/planner/optimizer/s_expr.rs +++ b/src/query/sql/src/planner/optimizer/s_expr.rs @@ -46,7 +46,7 @@ use crate::ScalarExpr; Debug(bound = false, attrs = "#[recursive::recursive]") )] pub struct SExpr { - pub(crate) plan: Arc<RelOperator>, + pub plan: Arc<RelOperator>, pub(crate) children: Vec<Arc<SExpr>>, pub(crate) original_group: Option<IndexType>,
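As a usage sketch for reviewers (not part of the diff): the new `PlanStatisticsExt` helpers let a test inject statistics before optimizing. The query, table name, and numbers below are illustrative only; the calls mirror the helpers added in `test_utils.rs`.

```rust
use databend_common_catalog::BasicColumnStatistics;
use databend_common_catalog::TableStatistics;
use databend_common_expression::Scalar;
use databend_common_storage::Datum;

use crate::sql::planner::optimizer::test_utils::optimize_plan;
use crate::sql::planner::optimizer::test_utils::raw_plan;
use crate::sql::planner::optimizer::test_utils::PlanStatisticsExt;

// Inside an async test with a prepared `ctx: Arc<QueryContext>`:
let mut plan = raw_plan(&ctx, "SELECT s_state FROM store").await?;

// Inject table-level and column-level statistics into the plan's Scan nodes.
plan.set_table_statistics(
    "store",
    Some(TableStatistics {
        num_rows: Some(1002),
        data_size: Some(135_680),
        data_size_compressed: None,
        index_size: None,
        number_of_blocks: None,
        number_of_segments: Some(1),
    }),
)?
.set_column_statistics(
    "store",
    "s_state",
    Some(BasicColumnStatistics {
        min: Datum::from_scalar(Scalar::String("AK".to_string())),
        max: Datum::from_scalar(Scalar::String("WY".to_string())),
        ndv: Some(50),
        null_count: 0,
    }),
)?;

// The optimizer now sees the injected statistics when costing the plan.
let optimized = optimize_plan(&ctx, plan).await?;
println!("{}", optimized.format_indent(false)?);
```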