Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
437 changes: 127 additions & 310 deletions Cargo.lock

Large diffs are not rendered by default.

14 changes: 5 additions & 9 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -336,11 +336,7 @@ prost-reflect = { workspace = true, optional = true }
prost-types = { workspace = true, optional = true }

# Databricks Zerobus
databricks-zerobus-ingest-sdk = { version = "2.0.1", optional = true }
# The SDK returns prost-types 0.14 DescriptorProto values; prost-reflect (used by the rest
# of the sink) is on prost-types 0.13. We bridge the two via wire-format re-encoding.
prost-014 = { package = "prost", version = "0.14", optional = true }
prost-types-014 = { package = "prost-types", version = "0.14", optional = true }
databricks-zerobus-ingest-sdk = { version = "2.0.1", optional = true, features = ["arrow-flight"] }

# GCP
goauth = { version = "0.16.0", optional = true }
Expand All @@ -361,9 +357,9 @@ greptimedb-ingester = { version = "0.17.0", default-features = false, optional =
# External libs
arc-swap = { workspace = true, default-features = false, optional = true }
async-compression = { version = "0.4.27", default-features = false, features = ["tokio", "gzip", "zstd"], optional = true }
arrow = { version = "56.2.0", default-features = false, features = ["ipc"], optional = true }
arrow-schema = { version = "56.2.0", default-features = false, optional = true }
parquet = { version = "56.2.0", default-features = false, features = [
arrow = { version = "58.2.0", default-features = false, features = ["ipc"], optional = true }
arrow-schema = { version = "58.2.0", default-features = false, optional = true }
parquet = { version = "58.2.0", default-features = false, features = [
"arrow",
"snap",
"zstd",
Expand Down Expand Up @@ -934,7 +930,7 @@ sinks-chronicle = []
sinks-clickhouse = ["dep:nom", "dep:rust_decimal", "codecs-arrow"]
sinks-console = []
sinks-databend = ["dep:databend-client"]
sinks-databricks-zerobus = ["dep:databricks-zerobus-ingest-sdk", "dep:prost-reflect", "dep:prost-014", "dep:prost-types-014", "dep:base64"]
sinks-databricks-zerobus = ["dep:databricks-zerobus-ingest-sdk", "codecs-arrow"]
sinks-datadog_events = []
sinks-datadog_logs = []
sinks-datadog_metrics = ["protobuf-build", "dep:prost", "dep:prost-reflect"]
Expand Down
3 changes: 2 additions & 1 deletion LICENSE-3rdparty.csv
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ arr_macro_impl,https://github.com/JoshMcguigan/arr_macro,MIT OR Apache-2.0,Josh
arrayvec,https://github.com/bluss/arrayvec,MIT OR Apache-2.0,bluss
arrow,https://github.com/apache/arrow-rs,Apache-2.0,Apache Arrow <dev@arrow.apache.org>
arrow-arith,https://github.com/apache/arrow-rs,Apache-2.0,Apache Arrow <dev@arrow.apache.org>
arrow-array,https://github.com/apache/arrow-rs,Apache-2.0,Apache Arrow <dev@arrow.apache.org>
arrow-array,https://github.com/apache/arrow-rs,Apache-2.0 AND MIT,Apache Arrow <dev@arrow.apache.org>
arrow-buffer,https://github.com/apache/arrow-rs,Apache-2.0,Apache Arrow <dev@arrow.apache.org>
arrow-cast,https://github.com/apache/arrow-rs,Apache-2.0,Apache Arrow <dev@arrow.apache.org>
arrow-csv,https://github.com/apache/arrow-rs,Apache-2.0,Apache Arrow <dev@arrow.apache.org>
Expand Down Expand Up @@ -316,6 +316,7 @@ half,https://github.com/starkat99/half-rs,MIT OR Apache-2.0,Kathryn Long <squees
hash_hasher,https://github.com/Fraser999/Hash-Hasher,Apache-2.0 OR MIT,Fraser Hutchison <fraser@astria.org>
hashbag,https://github.com/jonhoo/hashbag,MIT OR Apache-2.0,Jon Gjengset <jon@thesquareplanet.com>
hashbrown,https://github.com/rust-lang/hashbrown,MIT OR Apache-2.0,Amanieu d'Antras <amanieu@gmail.com>
hashbrown,https://github.com/rust-lang/hashbrown,MIT OR Apache-2.0,The hashbrown Authors
hashlink,https://github.com/kyren/hashlink,MIT OR Apache-2.0,kyren <kerriganw@gmail.com>
hdrhistogram,https://github.com/HdrHistogram/HdrHistogram_rust,MIT OR Apache-2.0,"Jon Gjengset <jon@thesquareplanet.com>, Marshall Pierce <marshall@mpierce.org>"
headers,https://github.com/hyperium/headers,MIT,Sean McArthur <sean@seanmonstar.com>
Expand Down
2 changes: 1 addition & 1 deletion changelog.d/24840_databricks_zerobus_sink.feature.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Add a new `databricks_zerobus` sink that streams log data to Databricks Unity Catalog tables via the Zerobus ingestion service. Supports OAuth 2.0 authentication, automatic schema fetching from Unity Catalog, and protobuf batch encoding.
Add a new `databricks_zerobus` sink that streams log data to Databricks Unity Catalog tables via the Zerobus ingestion service. Supports OAuth 2.0 authentication, automatic schema fetching from Unity Catalog, and Arrow batch encoding.

authors: flaviocruz
3 changes: 3 additions & 0 deletions changelog.d/arrow_58.enhancement.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Updated the bundled Apache Arrow and Parquet libraries from 56.2 to 58.2. This affects components that emit Arrow or Parquet data, such as the `clickhouse` sink and the `aws_s3` sink's Parquet encoding.

authors: flaviocruz
4 changes: 2 additions & 2 deletions lib/codecs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ path = "tests/bin/generate-avro-fixtures.rs"

[dependencies]
apache-avro = { version = "0.20.0", default-features = false }
arrow = { version = "56.2.0", default-features = false, features = ["ipc", "json"], optional = true }
parquet = { version = "56.2.0", default-features = false, features = [
arrow = { version = "58.2.0", default-features = false, features = ["ipc", "json"], optional = true }
parquet = { version = "58.2.0", default-features = false, features = [
"arrow",
"snap",
"zstd",
Expand Down
40 changes: 11 additions & 29 deletions lib/codecs/src/encoding/encoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,43 +8,42 @@ use crate::encoding::ArrowStreamSerializer;
#[cfg(feature = "parquet")]
use crate::encoding::ParquetSerializer;
use crate::{
encoding::{Error, Framer, ProtoBatchSerializer, Serializer},
encoding::{Error, Framer, Serializer},
internal_events::{EncoderFramingError, EncoderSerializeError},
};

/// The output of a batch encoding operation.
///
/// Different batch serializers produce different output types:
/// - Arrow serializer produces a `RecordBatch`
/// - Proto serializer produces individual byte buffers per event
/// Only available when the `arrow` feature is enabled.
#[cfg(feature = "arrow")]
#[derive(Debug)]
pub enum BatchOutput {
/// An Arrow RecordBatch containing all events encoded as columnar data.
#[cfg(feature = "arrow")]
Arrow(arrow::record_batch::RecordBatch),
/// A list of individually-serialized records (one per event).
Records(Vec<Vec<u8>>),
}

/// Serializers that support batch encoding (encoding all events at once).
///
/// Only available when the `arrow` feature is enabled (the `parquet` feature
/// implies `arrow`).
#[cfg(feature = "arrow")]
#[derive(Debug, Clone)]
pub enum BatchSerializer {
/// Arrow IPC stream format serializer.
#[cfg(feature = "arrow")]
Arrow(ArrowStreamSerializer),
/// Parquet format serializer.
#[cfg(feature = "parquet")]
Parquet(Box<ParquetSerializer>),
/// Protobuf batch serializer that encodes each event individually.
ProtoBatch(ProtoBatchSerializer),
}

/// An encoder that encodes batches of events.
#[cfg(feature = "arrow")]
#[derive(Debug, Clone)]
pub struct BatchEncoder {
serializer: BatchSerializer,
}

#[cfg(feature = "arrow")]
impl BatchEncoder {
/// Creates a new `BatchEncoder` with the specified batch serializer.
pub const fn new(serializer: BatchSerializer) -> Self {
Expand All @@ -57,25 +56,17 @@ impl BatchEncoder {
}

/// Get the HTTP content type.
///
/// Returns `None` for serializers that do not produce a single HTTP body
/// (e.g. `ProtoBatch`, which emits one record per event for an out-of-band
/// transport rather than an HTTP payload).
#[cfg(any(feature = "arrow", feature = "parquet"))]
pub const fn content_type(&self) -> Option<&'static str> {
match &self.serializer {
#[cfg(feature = "arrow")]
BatchSerializer::Arrow(_) => Some("application/vnd.apache.arrow.stream"),
#[cfg(feature = "parquet")]
BatchSerializer::Parquet(_) => Some("application/vnd.apache.parquet"),
BatchSerializer::ProtoBatch(_) => None,
}
}

/// Encode a batch of events into a `BatchOutput`.
pub fn encode_batch(&self, events: &[Event]) -> Result<BatchOutput, Error> {
match &self.serializer {
#[cfg(feature = "arrow")]
BatchSerializer::Arrow(serializer) => {
let record_batch = serializer.encode_to_record_batch(events).map_err(|err| {
use crate::encoding::ArrowEncodingError;
Expand All @@ -88,12 +79,6 @@ impl BatchEncoder {
})?;
Ok(BatchOutput::Arrow(record_batch))
}
BatchSerializer::ProtoBatch(serializer) => {
let records = serializer
.encode_batch(events)
.map_err(|err| Error::SerializingError(Box::new(err)))?;
Ok(BatchOutput::Records(records))
}
#[cfg(feature = "parquet")]
BatchSerializer::Parquet(_) => Err(Error::SerializingError(Box::from(
"Parquet serializer does not support encode_batch; use the tokio Encoder interface instead",
Expand All @@ -102,13 +87,12 @@ impl BatchEncoder {
}
}

#[cfg(feature = "arrow")]
impl tokio_util::codec::Encoder<Vec<Event>> for BatchEncoder {
type Error = Error;

#[allow(unused_variables)]
fn encode(&mut self, events: Vec<Event>, buffer: &mut BytesMut) -> Result<(), Self::Error> {
match &mut self.serializer {
#[cfg(feature = "arrow")]
BatchSerializer::Arrow(serializer) => {
serializer.encode(events, buffer).map_err(|err| {
use crate::encoding::ArrowEncodingError;
Expand All @@ -124,9 +108,6 @@ impl tokio_util::codec::Encoder<Vec<Event>> for BatchEncoder {
BatchSerializer::Parquet(serializer) => serializer
.encode(events, buffer)
.map_err(Error::SerializingError),
BatchSerializer::ProtoBatch(_) => Err(Error::SerializingError(Box::from(
"ProtoBatch serializer does not support the tokio Encoder interface; use BatchEncoder::encode_batch() instead",
))),
}
}
}
Expand All @@ -137,6 +118,7 @@ pub enum EncoderKind {
/// Uses framing to encode individual events
Framed(Box<Encoder<Framer>>),
/// Encodes events in batches without framing
#[cfg(feature = "arrow")]
Batch(BatchEncoder),
}

Expand Down
2 changes: 0 additions & 2 deletions lib/codecs/src/encoding/format/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ mod native_json;
mod otlp;
#[cfg(feature = "parquet")]
mod parquet;
mod proto_batch;
mod protobuf;
mod raw_message;
#[cfg(feature = "syslog")]
Expand Down Expand Up @@ -46,7 +45,6 @@ pub use native::{NativeSerializer, NativeSerializerConfig};
pub use native_json::{NativeJsonSerializer, NativeJsonSerializerConfig};
#[cfg(feature = "opentelemetry")]
pub use otlp::{OtlpSerializer, OtlpSerializerConfig};
pub use proto_batch::{ProtoBatchEncodingError, ProtoBatchSerializer, ProtoBatchSerializerConfig};
pub use protobuf::{ProtobufSerializer, ProtobufSerializerConfig, ProtobufSerializerOptions};
pub use raw_message::{RawMessageSerializer, RawMessageSerializerConfig};
#[cfg(feature = "syslog")]
Expand Down
Loading
Loading