From abd2b9c01cb1b5f8c801fd00936e5b566efd8e98 Mon Sep 17 00:00:00 2001 From: RobertLD Date: Thu, 16 Apr 2026 08:48:45 -0400 Subject: [PATCH 1/6] #33823 Improve err msgs when opening files that are the wrong format --- cpp/src/arrow/ipc/message.cc | 12 ++++++++++++ cpp/src/arrow/ipc/reader.cc | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 84ee62fe9e8d..1bcf20933ea1 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -565,6 +565,18 @@ Status DecodeMessage(MessageDecoder* decoder, io::InputStream* file) { auto metadata_length = decoder->next_required_size(); ARROW_ASSIGN_OR_RAISE(auto metadata, file->Read(metadata_length)); if (metadata->size() != metadata_length) { + // The first sizeof(int32_t) bytes of the Arrow file magic ("ARRO") may have been + // misread as metadata_length. Check if the remaining bytes complete the magic. + const auto remaining_magic = internal::kArrowMagicBytes.substr(sizeof(int32_t)); + if (metadata->size() >= static_cast(remaining_magic.size()) && + std::string_view(reinterpret_cast(metadata->data()), + remaining_magic.size()) == remaining_magic) { + return Status::Invalid( + "Expected to read ", metadata_length, " metadata bytes, but only read ", + metadata->size(), + ". This appears to be an Arrow IPC File format file. " + "Try open_file() instead of open_stream()."); + } return Status::Invalid("Expected to read ", metadata_length, " metadata bytes, but ", "only read ", metadata->size()); } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 580081384308..317e57ba16c6 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -1890,7 +1890,7 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { const auto magic_start = buffer->data() + sizeof(int32_t); if (std::string_view(reinterpret_cast(magic_start), kMagicSize) != kArrowMagicBytes) { - return Status::Invalid("Not an Arrow file"); + return Status::Invalid("Not an Arrow file. If this is an Arrow IPC Streaming format file, try open_stream() instead."); } int32_t footer_length = bit_util::FromLittleEndian( From db7dd635e3376d0996439f941b4ad154bd4708ca Mon Sep 17 00:00:00 2001 From: RobertLD Date: Thu, 16 Apr 2026 08:52:41 -0400 Subject: [PATCH 2/6] 33823 Formatting + linting --- cpp/src/arrow/ipc/message.cc | 9 ++++----- cpp/src/arrow/ipc/reader.cc | 4 +++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 1bcf20933ea1..285c7b1e5674 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -571,11 +571,10 @@ Status DecodeMessage(MessageDecoder* decoder, io::InputStream* file) { if (metadata->size() >= static_cast(remaining_magic.size()) && std::string_view(reinterpret_cast(metadata->data()), remaining_magic.size()) == remaining_magic) { - return Status::Invalid( - "Expected to read ", metadata_length, " metadata bytes, but only read ", - metadata->size(), - ". This appears to be an Arrow IPC File format file. " - "Try open_file() instead of open_stream()."); + return Status::Invalid("Expected to read ", metadata_length, + " metadata bytes, but only read ", metadata->size(), + ". This appears to be an Arrow IPC File format file. " + "Try open_file() instead of open_stream()."); } return Status::Invalid("Expected to read ", metadata_length, " metadata bytes, but ", "only read ", metadata->size()); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 317e57ba16c6..ead4e50672a6 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -1890,7 +1890,9 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { const auto magic_start = buffer->data() + sizeof(int32_t); if (std::string_view(reinterpret_cast(magic_start), kMagicSize) != kArrowMagicBytes) { - return Status::Invalid("Not an Arrow file. If this is an Arrow IPC Streaming format file, try open_stream() instead."); + return Status::Invalid( + "Not an Arrow file. If this is an Arrow IPC Streaming format file, try " + "open_stream() instead."); } int32_t footer_length = bit_util::FromLittleEndian( From d025ab01eb4ee83acd3b55ffc8b7490ee239ebab Mon Sep 17 00:00:00 2001 From: Robert DeRienzo Date: Fri, 24 Apr 2026 10:41:06 -0400 Subject: [PATCH 3/6] Update cpp/src/arrow/ipc/reader.cc Create more generic message instead of referencing c++ interface Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- cpp/src/arrow/ipc/reader.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index ead4e50672a6..4ac263c7a0d4 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -1891,8 +1891,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { if (std::string_view(reinterpret_cast(magic_start), kMagicSize) != kArrowMagicBytes) { return Status::Invalid( - "Not an Arrow file. If this is an Arrow IPC Streaming format file, try " - "open_stream() instead."); + "Not an Arrow file. If this is an Arrow IPC streaming format file, use " + "the IPC stream reader instead."); } int32_t footer_length = bit_util::FromLittleEndian( From 1d9b536ba14fe96e47eeaeb6b1b4d5f0a0954f88 Mon Sep 17 00:00:00 2001 From: Robert DeRienzo Date: Fri, 24 Apr 2026 10:41:39 -0400 Subject: [PATCH 4/6] Update cpp/src/arrow/ipc/message.cc Similarly, keep messages more generic and not referencing the c++ interface Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- cpp/src/arrow/ipc/message.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 285c7b1e5674..2ad4e3b6303d 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -573,8 +573,8 @@ Status DecodeMessage(MessageDecoder* decoder, io::InputStream* file) { remaining_magic.size()) == remaining_magic) { return Status::Invalid("Expected to read ", metadata_length, " metadata bytes, but only read ", metadata->size(), - ". This appears to be an Arrow IPC File format file. " - "Try open_file() instead of open_stream()."); + ". This appears to be an Arrow IPC file. " + "Try the IPC file reader instead of the IPC stream reader."); } return Status::Invalid("Expected to read ", metadata_length, " metadata bytes, but ", "only read ", metadata->size()); From 407c13436c3f50224f9e1f32e14514d42db46634 Mon Sep 17 00:00:00 2001 From: RobertLD Date: Wed, 29 Apr 2026 18:56:58 -0400 Subject: [PATCH 5/6] 33823 Add tests to verify new err msgs We want to ensure we're providing advice to the user that is correct/actionable. --- cpp/src/arrow/ipc/read_write_test.cc | 46 ++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 15cf0258b2ee..cec7fe131a35 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -2265,6 +2265,52 @@ TEST(TestRecordBatchStreamReader, MalformedInput) { ASSERT_RAISES(Invalid, RecordBatchStreamReader::Open(&garbage_reader)); } +TEST(TestRecordBatchStreamReader, OpenFileFormatSuggestsFileReader) { + std::shared_ptr batch; + ASSERT_OK(MakeIntRecordBatch(&batch)); + + FileWriterHelper helper; + ASSERT_OK(helper.Init(batch->schema(), IpcWriteOptions::Defaults())); + ASSERT_OK(helper.WriteBatch(batch)); + ASSERT_OK(helper.Finish()); + + io::BufferReader reader(helper.buffer_); + // Check we mention using the file_reader when we detect file format + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("file reader"), + RecordBatchStreamReader::Open(&reader)); +} + +TEST(TestRecordBatchStreamReader, CorruptDataDoesNotSuggestFileReader) { + // Continuation marker + metadata_length = 100, then 8 bytes of non-magic data. + const std::string corrupt( + "\xff\xff\xff\xff" + "\x64\x00\x00\x00" + "ABABABAB", + 16); + auto buffer = std::make_shared(corrupt); + io::BufferReader reader(buffer); + // Validate that we don't suggest file reader when file is just corrupt + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::Not(::testing::HasSubstr("file reader")), + RecordBatchStreamReader::Open(&reader)); +} + +TEST(TestRecordBatchFileReader, OpenStreamFormatSuggestsStreamReader) { + std::shared_ptr batch; + ASSERT_OK(MakeIntRecordBatch(&batch)); + + StreamWriterHelper helper; + ASSERT_OK(helper.Init(batch->schema(), IpcWriteOptions::Defaults())); + ASSERT_OK(helper.WriteBatch(batch)); + ASSERT_OK(helper.Finish()); + + auto buf_reader = std::make_shared(helper.buffer_); + // Check we mention using the stream_reader when we detect stream format + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("stream reader"), + RecordBatchFileReader::Open(buf_reader.get(), helper.buffer_->size())); +} + class EndlessCollectListener : public CollectListener { public: EndlessCollectListener() : CollectListener(), decoder_(nullptr) {} From d01cb5d5b73abe0a90472f6890cd65dc1e13e91c Mon Sep 17 00:00:00 2001 From: Robert DeRienzo Date: Tue, 12 May 2026 15:58:01 +0000 Subject: [PATCH 6/6] 33823 PR suggestions Fix endianness bug, avoid huge metadata reads and alter err messages --- cpp/src/arrow/ipc/message.cc | 29 +++++++++++++++++----------- cpp/src/arrow/ipc/read_write_test.cc | 7 ++++--- cpp/src/arrow/ipc/reader.cc | 2 +- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 2ad4e3b6303d..fc7c2baec519 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -563,19 +563,26 @@ Status DecodeMessage(MessageDecoder* decoder, io::InputStream* file) { } auto metadata_length = decoder->next_required_size(); + + // "ARRO" (first 4 bytes of kArrowMagicBytes) as little-endian int32. + constexpr int32_t kArrowMagicPrefix = 0x4F525241; + + // Did we misinterpret the metadata as a length? + if (metadata_length == kArrowMagicPrefix) { + constexpr std::string_view kRemainingMagic = + internal::kArrowMagicBytes.substr(sizeof(int32_t)); + ARROW_ASSIGN_OR_RAISE(auto peek, file->Read(kRemainingMagic.size())); + if (peek->size() >= static_cast(kRemainingMagic.size()) && + std::string_view(reinterpret_cast(peek->data()), + kRemainingMagic.size()) == kRemainingMagic) { + return Status::Invalid( + "This appears to be an Arrow IPC file. " + "Try the IPC file reader instead of the IPC stream reader."); + } + } ARROW_ASSIGN_OR_RAISE(auto metadata, file->Read(metadata_length)); + if (metadata->size() != metadata_length) { - // The first sizeof(int32_t) bytes of the Arrow file magic ("ARRO") may have been - // misread as metadata_length. Check if the remaining bytes complete the magic. - const auto remaining_magic = internal::kArrowMagicBytes.substr(sizeof(int32_t)); - if (metadata->size() >= static_cast(remaining_magic.size()) && - std::string_view(reinterpret_cast(metadata->data()), - remaining_magic.size()) == remaining_magic) { - return Status::Invalid("Expected to read ", metadata_length, - " metadata bytes, but only read ", metadata->size(), - ". This appears to be an Arrow IPC file. " - "Try the IPC file reader instead of the IPC stream reader."); - } return Status::Invalid("Expected to read ", metadata_length, " metadata bytes, but ", "only read ", metadata->size()); } diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index cec7fe131a35..7a7c13093f80 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -2276,7 +2276,8 @@ TEST(TestRecordBatchStreamReader, OpenFileFormatSuggestsFileReader) { io::BufferReader reader(helper.buffer_); // Check we mention using the file_reader when we detect file format - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("file reader"), + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, + ::testing::HasSubstr("Try the IPC file reader"), RecordBatchStreamReader::Open(&reader)); } @@ -2291,7 +2292,7 @@ TEST(TestRecordBatchStreamReader, CorruptDataDoesNotSuggestFileReader) { io::BufferReader reader(buffer); // Validate that we don't suggest file reader when file is just corrupt EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::Not(::testing::HasSubstr("file reader")), + Invalid, ::testing::Not(::testing::HasSubstr("Try the IPC file reader")), RecordBatchStreamReader::Open(&reader)); } @@ -2307,7 +2308,7 @@ TEST(TestRecordBatchFileReader, OpenStreamFormatSuggestsStreamReader) { auto buf_reader = std::make_shared(helper.buffer_); // Check we mention using the stream_reader when we detect stream format EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("stream reader"), + Invalid, ::testing::HasSubstr("use the IPC stream reader"), RecordBatchFileReader::Open(buf_reader.get(), helper.buffer_->size())); } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 4ac263c7a0d4..68b0d1068c2c 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -1891,7 +1891,7 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { if (std::string_view(reinterpret_cast(magic_start), kMagicSize) != kArrowMagicBytes) { return Status::Invalid( - "Not an Arrow file. If this is an Arrow IPC streaming format file, use " + "Not an Arrow file. If this is an Arrow IPC stream, use " "the IPC stream reader instead."); }