1515// specific language governing permissions and limitations
1616// under the License.
1717
18- //! Defines the join plan for executing partitions in parallel and then joining the results
19- //! into a set of partitions.
18+ //! [`HashJoinExec`] Partitioned Hash Join Operator
2019
2120use std:: fmt;
2221use std:: mem:: size_of;
@@ -78,29 +77,140 @@ use futures::{ready, Stream, StreamExt, TryStreamExt};
7877
7978type JoinLeftData = ( JoinHashMap , RecordBatch , MemoryReservation ) ;
8079
81- /// Join execution plan executes partitions in parallel and combines them into a set of
82- /// partitions.
80+ /// Join execution plan: Evaluates eqijoin predicates in parallel on multiple
81+ /// partitions using a hash table and an optional filter list to apply post
82+ /// join.
8383///
84- /// Filter expression expected to contain non-equality predicates that can not be pushed
85- /// down to any of join inputs.
86- /// In case of outer join, filter applied to only matched rows.
84+ /// # Join Expressions
85+ ///
86+ /// This implementation is optimized for evaluating eqijoin predicates (
87+ /// `<col1> = <col2>`) expressions, which are represented as a list of `Columns`
88+ /// in [`Self::on`].
89+ ///
90+ /// Non-equality predicates, which can not pushed down to a join inputs (e.g.
91+ /// `<col1> != <col2>`) are known as "filter expressions" and are evaluated
92+ /// after the equijoin predicates.
93+ ///
94+ /// # "Build Side" vs "Probe Side"
95+ ///
96+ /// HashJoin takes two inputs, which are referred to as the "build" and the
97+ /// "probe". The build side is the first child, and the probe side is the second
98+ /// child.
99+ ///
100+ /// The two inputs are treated differently and it is VERY important that the
101+ /// *smaller* input is placed on the build side to minimize the work of creating
102+ /// the hash table.
103+ ///
104+ /// ```text
105+ /// ┌───────────┐
106+ /// │ HashJoin │
107+ /// │ │
108+ /// └───────────┘
109+ /// │ │
110+ /// ┌─────┘ └─────┐
111+ /// ▼ ▼
112+ /// ┌────────────┐ ┌─────────────┐
113+ /// │ Input │ │ Input │
114+ /// │ [0] │ │ [1] │
115+ /// └────────────┘ └─────────────┘
116+ ///
117+ /// "build side" "probe side"
118+ /// ```
119+ ///
120+ /// Execution proceeds in 2 stages:
121+ ///
122+ /// 1. the **build phase** where a hash table is created from the tuples of the
123+ /// build side.
124+ ///
125+ /// 2. the **probe phase** where the tuples of the probe side are streamed
126+ /// through, checking for matches of the join keys in the hash table.
127+ ///
128+ /// ```text
129+ /// ┌────────────────┐ ┌────────────────┐
130+ /// │ ┌─────────┐ │ │ ┌─────────┐ │
131+ /// │ │ Hash │ │ │ │ Hash │ │
132+ /// │ │ Table │ │ │ │ Table │ │
133+ /// │ │(keys are│ │ │ │(keys are│ │
134+ /// │ │equi join│ │ │ │equi join│ │ Stage 2: batches from
135+ /// Stage 1: the │ │columns) │ │ │ │columns) │ │ the probe side are
136+ /// *entire* build │ │ │ │ │ │ │ │ streamed through, and
137+ /// side is read │ └─────────┘ │ │ └─────────┘ │ checked against the
138+ /// into the hash │ ▲ │ │ ▲ │ contents of the hash
139+ /// table │ HashJoin │ │ HashJoin │ table
140+ /// └──────┼─────────┘ └──────────┼─────┘
141+ /// ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
142+ /// │ │
143+ ///
144+ /// │ │
145+ /// ┌────────────┐ ┌────────────┐
146+ /// │RecordBatch │ │RecordBatch │
147+ /// └────────────┘ └────────────┘
148+ /// ┌────────────┐ ┌────────────┐
149+ /// │RecordBatch │ │RecordBatch │
150+ /// └────────────┘ └────────────┘
151+ /// ... ...
152+ /// ┌────────────┐ ┌────────────┐
153+ /// │RecordBatch │ │RecordBatch │
154+ /// └────────────┘ └────────────┘
155+ ///
156+ /// build side probe side
157+ ///
158+ /// ```
159+ ///
160+ /// # Example "Optimal" Plans
161+ ///
162+ /// The differences in the inputs means that for classic "Star Schema Query",
163+ /// the optimal plan will be a **"Right Deep Tree"** . A Star Schema Query is
164+ /// one where there is one large table and several smaller "dimension" tables,
165+ /// joined on `Foreign Key = Primary Key` predicates.
166+ ///
167+ /// A "Right Deep Tree" looks like this large table as the probe side on the
168+ /// lowest join:
169+ ///
170+ /// ```text
171+ /// ┌───────────┐
172+ /// │ HashJoin │
173+ /// │ │
174+ /// └───────────┘
175+ /// │ │
176+ /// ┌───────┘ └──────────┐
177+ /// ▼ ▼
178+ /// ┌───────────────┐ ┌───────────┐
179+ /// │ small table 1 │ │ HashJoin │
180+ /// │ "dimension" │ │ │
181+ /// └───────────────┘ └───┬───┬───┘
182+ /// ┌──────────┘ └───────┐
183+ /// │ │
184+ /// ▼ ▼
185+ /// ┌───────────────┐ ┌───────────┐
186+ /// │ small table 2 │ │ HashJoin │
187+ /// │ "dimension" │ │ │
188+ /// └───────────────┘ └───┬───┬───┘
189+ /// ┌────────┘ └────────┐
190+ /// │ │
191+ /// ▼ ▼
192+ /// ┌───────────────┐ ┌───────────────┐
193+ /// │ small table 3 │ │ large table │
194+ /// │ "dimension" │ │ "fact" │
195+ /// └───────────────┘ └───────────────┘
196+ /// ```
87197#[ derive( Debug ) ]
88198pub struct HashJoinExec {
89199 /// left (build) side which gets hashed
90200 pub left : Arc < dyn ExecutionPlan > ,
91201 /// right (probe) side which are filtered by the hash table
92202 pub right : Arc < dyn ExecutionPlan > ,
93- /// Set of common columns used to join on
203+ /// Set of equijoin columns from the relations: `(left_col, right_col)`
94204 pub on : Vec < ( Column , Column ) > ,
95205 /// Filters which are applied while finding matching rows
96206 pub filter : Option < JoinFilter > ,
97- /// How the join is performed
207+ /// How the join is performed (`OUTER`, `INNER`, etc)
98208 pub join_type : JoinType ,
99- /// The schema once the join is applied
209+ /// The output schema for the join
100210 schema : SchemaRef ,
101211 /// Build-side data
102212 left_fut : OnceAsync < JoinLeftData > ,
103- /// Shares the `RandomState` for the hashing algorithm
213+ /// Shared the `RandomState` for the hashing algorithm
104214 random_state : RandomState ,
105215 /// Output order
106216 output_order : Option < Vec < PhysicalSortExpr > > ,
@@ -110,12 +220,16 @@ pub struct HashJoinExec {
110220 metrics : ExecutionPlanMetricsSet ,
111221 /// Information of index and left / right placement of columns
112222 column_indices : Vec < ColumnIndex > ,
113- /// If null_equals_null is true, null == null else null != null
223+ /// Null matching behavior: If `null_equals_null` is true, rows that have
224+ /// `null`s in both left and right equijoin columns will be matched.
225+ /// Otherwise, rows that have `null`s in the join columns will not be
226+ /// matched and thus will not appear in the output.
114227 pub null_equals_null : bool ,
115228}
116229
117230impl HashJoinExec {
118231 /// Tries to create a new [HashJoinExec].
232+ ///
119233 /// # Error
120234 /// This function errors when it is not possible to join the left and right sides on keys `on`.
121235 pub fn try_new (
0 commit comments