Skip to content

Commit 62cd75e

Browse files
authored
Merge pull request #39 from poyrazK/feature/full-join-option-c-left-tracking
feat: track unmatched LEFT tuples for FULL JOIN Phase 3-5 collection
2 parents 5fba9c0 + 2b0bf75 commit 62cd75e

3 files changed

Lines changed: 127 additions & 2 deletions

File tree

include/executor/operator.hpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,10 @@ class HashJoinOperator : public Operator {
343343
/* Final phase for RIGHT/FULL joins */
344344
std::optional<std::unordered_multimap<std::string, BuildTuple>::iterator> right_idx_iter_;
345345

346+
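/* Storage for unmatched LEFT tuples and their join keys, kept in parallel (same index).
 * Populated for both LEFT and FULL joins; consumed by FULL JOIN distributed collection. */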
347+
std::vector<Tuple> unmatched_left_rows_;
348+
std::vector<std::string> unmatched_left_keys_;
349+
346350
public:
347351
HashJoinOperator(std::unique_ptr<Operator> left, std::unique_ptr<Operator> right,
348352
std::unique_ptr<parser::Expression> left_key,
@@ -370,6 +374,18 @@ class HashJoinOperator : public Operator {
370374
* @return Vector of strings - the join key values for unmatched right rows
371375
*/
372376
[[nodiscard]] std::vector<std::string> get_unmatched_right_keys() const;
377+
378+
/**
379+
* @brief Get unmatched left rows after join execution
380+
* @return Vector of tuples - the left-side rows that had no match
381+
*/
382+
[[nodiscard]] std::vector<Tuple> get_unmatched_left_rows() const;
383+
384+
/**
385+
* @brief Get join key values of unmatched left rows
386+
* @return Vector of strings - the join key values for unmatched left rows
387+
*/
388+
[[nodiscard]] std::vector<std::string> get_unmatched_left_keys() const;
373389
};
374390

375391
/**

src/executor/operator.cpp

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -759,6 +759,7 @@ bool HashJoinOperator::open() {
759759
bool HashJoinOperator::next(Tuple& out_tuple) {
760760
auto left_schema = left_->output_schema();
761761
auto right_schema = right_->output_schema();
762+
std::string current_left_key_str;
762763

763764
while (true) {
764765
if (match_iter_.has_value()) {
@@ -784,6 +785,10 @@ bool HashJoinOperator::next(Tuple& out_tuple) {
784785
match_iter_ = std::nullopt;
785786
if ((join_type_ == JoinType::Left || join_type_ == JoinType::Full) &&
786787
!left_had_match_) {
788+
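/* Record the unmatched left tuple and its key. This runs for LEFT as well as FULL joins;
 * the stored rows are consumed by FULL JOIN Phase 3-5 collection.
 * NOTE(review): current_left_key_str is a local of next(), so it is empty here if this
 * call was entered with match_iter_ already set -- confirm this path cannot record an
 * empty key, or keep the key in a member alongside left_tuple_. */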
789+
unmatched_left_keys_.push_back(current_left_key_str);
790+
unmatched_left_rows_.push_back(Tuple(*left_tuple_));
791+
787792
std::pmr::vector<common::Value> joined_values(left_tuple_->values().begin(),
788793
left_tuple_->values().end(),
789794
get_memory_resource());
@@ -802,15 +807,22 @@ bool HashJoinOperator::next(Tuple& out_tuple) {
802807
if (left_->next(next_left)) {
803808
left_tuple_ = std::move(next_left);
804809
left_had_match_ = false;
810+
}
811+
812+
if (left_tuple_.has_value()) {
805813
const common::Value key =
806814
left_key_->evaluate(&(left_tuple_.value()), &left_schema, get_params());
815+
current_left_key_str = key.to_string();
816+
auto range = hash_table_.equal_range(current_left_key_str);
807817

808-
/* Look up in hash table */
809-
auto range = hash_table_.equal_range(key.to_string());
810818
if (range.first != range.second) {
811819
match_iter_ = {range.first, range.second};
812820
} else if (join_type_ == JoinType::Left || join_type_ == JoinType::Full) {
813821
/* No match found immediately, emit NULLs if Left/Full join */
822+
/* Also record the tuple and its key for Phase 3-5 collection (done for LEFT and FULL joins alike) */
823+
unmatched_left_keys_.push_back(current_left_key_str);
824+
unmatched_left_rows_.push_back(Tuple(*left_tuple_));
825+
814826
std::pmr::vector<common::Value> joined_values(left_tuple_->values().begin(),
815827
left_tuple_->values().end(),
816828
get_memory_resource());
@@ -862,6 +874,8 @@ void HashJoinOperator::close() {
862874
hash_table_.clear();
863875
match_iter_ = std::nullopt;
864876
left_tuple_ = std::nullopt;
877+
unmatched_left_rows_.clear();
878+
unmatched_left_keys_.clear();
865879
set_state(ExecState::Done);
866880
}
867881

@@ -912,6 +926,14 @@ std::vector<std::string> HashJoinOperator::get_unmatched_right_keys() const {
912926
return keys;
913927
}
914928

929+
std::vector<Tuple> HashJoinOperator::get_unmatched_left_rows() const {
930+
return unmatched_left_rows_;
931+
}
932+
933+
std::vector<std::string> HashJoinOperator::get_unmatched_left_keys() const {
934+
return unmatched_left_keys_;
935+
}
936+
915937
/* --- LimitOperator --- */
916938

917939
LimitOperator::LimitOperator(std::unique_ptr<Operator> child, int64_t limit, int64_t offset)

tests/operator_tests.cpp

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -516,6 +516,93 @@ TEST_F(OperatorTests, HashJoinLeft) {
516516
join->close();
517517
}
518518

519+
// Verifies that a LEFT hash join records every left tuple that finds no partner on the
// right side, and that get_unmatched_left_rows()/get_unmatched_left_keys() expose those
// tuples and their join keys in the order the left input produced them.
TEST_F(OperatorTests, HashJoinLeftUnmatchedCollection) {
    // Left table: ids 1, 2, 3 (only id=2 has a match on the right)
    Schema left_schema = make_schema({{"id", common::ValueType::TYPE_INT64}});
    std::vector<Tuple> left_data;
    left_data.push_back(make_tuple({common::Value::make_int64(1)}));  // no match
    left_data.push_back(make_tuple({common::Value::make_int64(2)}));  // matches
    left_data.push_back(make_tuple({common::Value::make_int64(3)}));  // no match

    // Right table: ids 2, 4
    Schema right_schema = make_schema({{"id", common::ValueType::TYPE_INT64}});
    std::vector<Tuple> right_data;
    right_data.push_back(make_tuple({common::Value::make_int64(2)}));
    right_data.push_back(make_tuple({common::Value::make_int64(4)}));

    auto left_scan = make_buffer_scan("left_table", left_data, left_schema);
    auto right_scan = make_buffer_scan("right_table", right_data, right_schema);

    auto join = make_hash_join(std::move(left_scan), std::move(right_scan), col_expr("id"),
                               col_expr("id"), JoinType::Left);

    ASSERT_TRUE(join->init());
    ASSERT_TRUE(join->open());

    // Drain the operator; the unmatched-left lists are filled as a side effect of next()
    Tuple tuple;
    while (join->next(tuple)) {
    }

    // Must be read before close(), which clears the tracked rows and keys
    auto unmatched_rows = join->get_unmatched_left_rows();
    auto unmatched_keys = join->get_unmatched_left_keys();

    // Fatal assertions: the indexed accesses below would read out of bounds
    // if either list had fewer than two entries.
    ASSERT_EQ(unmatched_rows.size(), 2U);
    ASSERT_EQ(unmatched_keys.size(), 2U);

    // Keys should be "1" and "3" (to_string of int64)
    EXPECT_EQ(unmatched_keys[0], "1");
    EXPECT_EQ(unmatched_keys[1], "3");

    // Check the actual tuple values
    EXPECT_EQ(unmatched_rows[0].get(0).to_int64(), 1);
    EXPECT_EQ(unmatched_rows[1].get(0).to_int64(), 3);

    join->close();
}
566+
567+
// Verifies unmatched-LEFT tracking on the FULL join path: same idea as the LEFT join
// test, but with JoinType::Full and an additional right row that has no partner.
TEST_F(OperatorTests, HashJoinFullUnmatchedLeftCollection) {
    // Left table: ids 1, 2 (only id=2 has a match on the right)
    Schema left_schema = make_schema({{"id", common::ValueType::TYPE_INT64}});
    std::vector<Tuple> left_data;
    left_data.push_back(make_tuple({common::Value::make_int64(1)}));  // no match
    left_data.push_back(make_tuple({common::Value::make_int64(2)}));  // matches

    // Right table: ids 2, 3
    Schema right_schema = make_schema({{"id", common::ValueType::TYPE_INT64}});
    std::vector<Tuple> right_data;
    right_data.push_back(make_tuple({common::Value::make_int64(2)}));
    right_data.push_back(make_tuple({common::Value::make_int64(3)}));  // no match

    auto left_scan = make_buffer_scan("left_table", left_data, left_schema);
    auto right_scan = make_buffer_scan("right_table", right_data, right_schema);

    auto join = make_hash_join(std::move(left_scan), std::move(right_scan), col_expr("id"),
                               col_expr("id"), JoinType::Full);

    ASSERT_TRUE(join->init());
    ASSERT_TRUE(join->open());

    // Drain the operator; the unmatched-left lists are filled as a side effect of next()
    Tuple tuple;
    while (join->next(tuple)) {
    }

    // For FULL join, we should track unmatched LEFT tuples
    // Note: RIGHT unmatched tuples are emitted during right scan phase and marked matched,
    // so get_unmatched_right_keys() won't include them (they're already "accounted for")
    auto unmatched_left_keys = join->get_unmatched_left_keys();
    auto unmatched_left_rows = join->get_unmatched_left_rows();

    // Fatal assertions: the [0] accesses below would read out of bounds on an empty list.
    // Left unmatched: id=1
    ASSERT_EQ(unmatched_left_keys.size(), 1U);
    ASSERT_EQ(unmatched_left_rows.size(), 1U);

    EXPECT_EQ(unmatched_left_keys[0], "1");
    // Rows are recorded together with keys, so the stored tuple must be the id=1 row
    EXPECT_EQ(unmatched_left_rows[0].get(0).to_int64(), 1);

    join->close();
}
605+
519606
TEST_F(OperatorTests, HashJoinEmpty) {
520607
// Left has data
521608
Schema left_schema = make_schema({{"id", common::ValueType::TYPE_INT64}});

0 commit comments

Comments
 (0)