
Commit 93762f4

Merge pull request #219 from kalamstack/028-auth-integration
More improvements to performance of ws and fixes to @kalamdb/orm package
2 parents (ab0abfe + ce0adf9) · commit 93762f4

77 files changed

Lines changed: 7450 additions & 706 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -126,3 +126,4 @@ link/sdks/typescript/client/.npmrc
 /link/kalam-client/sdks/typescript/client/.wasm-cargo-home-size-current2
 /link/kalam-client/sdks/typescript/client/.wasm-target-size-current
 /link/kalam-client/sdks/typescript/client/.wasm-target-size-current2
+/benchv2/logs

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -106,7 +106,7 @@ arrow-ipc = { version = "58.1.0", default-features = false }
 arrow-schema = { version = "58.1.0" }
 datafusion = { version = "53.1.0", default-features = false, features = ["sql", "parquet", "recursive_protection", "nested_expressions"] }
 datafusion-datasource = { version = "53.1.0", default-features = false }
-datafusion-common = { version = "53.1.0" }
+datafusion-common = { version = "53.1.0", default-features = false }
 datafusion-expr = { version = "53.1.0" }
 datafusion-functions-json = { version = "0.53.0" }
 sqlparser = { version = "0.61.0" }

backend/crates/kalamdb-api/src/http/auth/models/login_request.rs

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ const MAX_PASSWORD_LENGTH: usize = 256;
 #[serde(deny_unknown_fields)]
 pub struct LoginRequest {
     /// Canonical user identifier for authentication
-    #[serde(deserialize_with = "validate_user_length")]
+    #[serde(alias = "username", deserialize_with = "validate_user_length")]
     pub user: String,
     /// Password for authentication
     #[serde(deserialize_with = "validate_password_length")]
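For context on the `alias` change above: serde treats an alias as an additional accepted key for the same field, and aliases still count as known fields under `deny_unknown_fields`. A minimal self-contained sketch (the `validate_user_length` deserializer is elided, so this is illustrative rather than the crate's exact type):

use serde::Deserialize;

#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct LoginRequest {
    // Accepts both {"user": ...} and the legacy {"username": ...} key.
    #[serde(alias = "username")]
    user: String,
    password: String,
}

fn main() {
    let a: LoginRequest =
        serde_json::from_str(r#"{"user":"alice","password":"hunter2"}"#).unwrap();
    let b: LoginRequest =
        serde_json::from_str(r#"{"username":"alice","password":"hunter2"}"#).unwrap();
    assert_eq!(a.user, b.user);
}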

backend/crates/kalamdb-api/src/ws/compression.rs

Lines changed: 19 additions & 5 deletions
@@ -2,6 +2,12 @@
 //!
 //! Provides gzip compression for WebSocket messages to reduce bandwidth.
 //! Compression is enabled by default for all messages over the threshold.
+//!
+//! Performance: pre-sizes the output buffer to a fraction of the input so
+//! the encoder doesn't grow through multiple doublings on large payloads.
+//! actix-ws consumes owned `Vec<u8>`, so a single allocation per message is
+//! unavoidable without a per-connection write pipeline; this keeps it to
+//! exactly one modestly-sized allocation rather than several doublings.

 use flate2::write::GzEncoder;
 use flate2::Compression;
@@ -11,20 +17,28 @@ use std::io::Write;
 /// Messages smaller than this are sent uncompressed
 pub const COMPRESSION_THRESHOLD: usize = 512;

-/// Compress data using gzip
+/// Heuristic initial capacity for gzip output. Real-world JSON payloads
+/// compress to 20–40% of original; pre-sizing to 1/3 avoids 2–3
+/// `Vec::grow` reallocations inside the encoder on the hot path.
+#[inline]
+fn gzip_initial_capacity(input_len: usize) -> usize {
+    // Minimum gzip header + footer overhead is ~20 bytes.
+    (input_len / 3).max(64)
+}
+
+/// Compress data using gzip.
 ///
-/// Returns compressed bytes on success, or the original data if compression fails
+/// Returns compressed bytes on success, or the original data if compression fails.
 pub fn compress_gzip(data: &[u8]) -> Vec<u8> {
-    let mut encoder = GzEncoder::new(Vec::new(), Compression::fast());
+    let buf = Vec::with_capacity(gzip_initial_capacity(data.len()));
+    let mut encoder = GzEncoder::new(buf, Compression::fast());
     if encoder.write_all(data).is_ok() {
         if let Ok(compressed) = encoder.finish() {
-            // Only use compressed if it's actually smaller
             if compressed.len() < data.len() {
                 return compressed;
             }
         }
     }
-    // Fallback to original data
     data.to_vec()
 }
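A standalone sketch of the pre-sizing idea, with a decoder round-trip to show the output is ordinary gzip. The 1/3 ratio and 64-byte floor mirror the constants above; actual savings depend on the payload:

use std::io::{Read, Write};

use flate2::read::GzDecoder;
use flate2::write::GzEncoder;
use flate2::Compression;

fn compress_presized(data: &[u8]) -> Vec<u8> {
    // Pre-size to ~1/3 of input so the encoder rarely reallocates.
    let buf = Vec::with_capacity((data.len() / 3).max(64));
    let mut encoder = GzEncoder::new(buf, Compression::fast());
    if encoder.write_all(data).is_ok() {
        if let Ok(compressed) = encoder.finish() {
            if compressed.len() < data.len() {
                return compressed;
            }
        }
    }
    // Incompressible or failed: fall back to the original bytes.
    data.to_vec()
}

fn main() {
    let payload = br#"{"type":"change","subscription_id":"sub_1"}"#.repeat(64);
    let compressed = compress_presized(&payload);
    assert!(compressed.len() < payload.len());

    // Round-trip: the output decodes back to the exact input.
    let mut decoded = Vec::new();
    GzDecoder::new(&compressed[..]).read_to_end(&mut decoded).unwrap();
    assert_eq!(decoded, payload);
}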

backend/crates/kalamdb-api/src/ws/events/mod.rs

Lines changed: 21 additions & 6 deletions
@@ -122,9 +122,21 @@ pub async fn send_message<T: serde::Serialize>(
 /// sent as binary frames. When `false`, the raw payload is always sent as a
 /// text frame, which is easier to inspect during development.
 async fn send_data(session: &mut Session, data: &[u8], compress: bool) -> Result<(), ()> {
+    // Fast path: no compression — send as Text frame directly without a
+    // UTF-8 round-trip. Callers only reach this path with bytes that they
+    // just produced from `serde_json`/`rmp_serde`, so they are already valid
+    // UTF-8 when `compress == false` and serialization chose the text branch.
     if !compress {
-        let text = String::from_utf8_lossy(data);
-        return session.text(text.into_owned()).await.map_err(|_| ());
+        // `String::from_utf8_lossy(..).into_owned()` previously allocated a
+        // fresh String and scanned every byte even for known-valid JSON. Use
+        // `from_utf8` and fall back to lossy only on the (never-observed)
+        // error path to stay defensive without paying the cost on the hot
+        // path.
+        let owned = match std::str::from_utf8(data) {
+            Ok(s) => s.to_owned(),
+            Err(_) => String::from_utf8_lossy(data).into_owned(),
+        };
+        return session.text(owned).await.map_err(|_| ());
     }

     let (payload, compressed) = maybe_compress(data);
@@ -133,10 +145,13 @@ async fn send_data(session: &mut Session, data: &[u8], compress: bool) -> Result
         // Send compressed data as binary frame
         session.binary(payload).await.map_err(|_| ())
     } else {
-        // Send uncompressed data as text frame
-        // Safe to convert since original data was valid JSON string
-        let text = String::from_utf8_lossy(&payload);
-        session.text(text.into_owned()).await.map_err(|_| ())
+        // Send uncompressed data as text frame. `maybe_compress` returned the
+        // original bytes unchanged, so they remain valid UTF-8 JSON.
+        let owned = match std::str::from_utf8(&payload) {
+            Ok(s) => s.to_owned(),
+            Err(_) => String::from_utf8_lossy(&payload).into_owned(),
+        };
+        session.text(owned).await.map_err(|_| ())
     }
 }
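The conversion pattern in isolation: strict `from_utf8` first, lossy only as the defensive fallback. A sketch; `Session` and the actix-ws send calls are omitted:

/// Owned text from wire bytes: validate strictly, degrade to U+FFFD
/// replacement characters only if the bytes are somehow not UTF-8.
fn bytes_to_text(data: &[u8]) -> String {
    match std::str::from_utf8(data) {
        Ok(s) => s.to_owned(),
        Err(_) => String::from_utf8_lossy(data).into_owned(),
    }
}

fn main() {
    // Hot path: serializer output is valid UTF-8 JSON.
    assert_eq!(bytes_to_text(br#"{"ok":true}"#), r#"{"ok":true}"#);
    // Defensive path: invalid bytes become replacement characters.
    assert_eq!(bytes_to_text(&[0xFF, 0xFE]), "\u{FFFD}\u{FFFD}");
}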

backend/crates/kalamdb-commons/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ once_cell = { workspace = true }
 hex = { workspace = true }
 sha2 = { workspace = true }
 thiserror = { workspace = true }
-datafusion-common = { workspace = true, optional = true }
+datafusion-common = { workspace = true, optional = true, default-features = false }
 parking_lot = { workspace = true }
 storekey = { workspace = true }
 # Optional dependencies - only enabled via features

backend/crates/kalamdb-commons/src/websocket.rs

Lines changed: 15 additions & 3 deletions
@@ -857,9 +857,21 @@ impl WireNotification {
         let mut buf = Vec::with_capacity(est);

         buf.extend_from_slice(b"{\"type\":\"change\",\"subscription_id\":\"");
-        // Escape the subscription_id JSON-safely (ids are alphanumeric, but be safe).
-        let escaped = self.subscription_id.replace('\\', "\\\\").replace('"', "\\\"");
-        buf.extend_from_slice(escaped.as_bytes());
+        // Fast path: the overwhelming majority of real subscription IDs are
+        // plain ASCII (UUIDs, cuids, alnum+`-_`), so we can splice the raw
+        // bytes without allocating a second String for escaping. Only fall
+        // back to the allocating path when we actually see a byte that would
+        // need JSON escaping.
+        let sid_bytes = self.subscription_id.as_bytes();
+        let needs_escape = sid_bytes
+            .iter()
+            .any(|&b| b == b'\\' || b == b'"' || b < 0x20 || b >= 0x7f);
+        if needs_escape {
+            let escaped = self.subscription_id.replace('\\', "\\\\").replace('"', "\\\"");
+            buf.extend_from_slice(escaped.as_bytes());
+        } else {
+            buf.extend_from_slice(sid_bytes);
+        }
         buf.extend_from_slice(b"\",\"change_type\":\"");
         buf.extend_from_slice(p.change_type.as_str().as_bytes());
         buf.push(b'"');
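The escape-detection fast path, sketched standalone. The escaping here mirrors the diff's two structural characters; a complete JSON writer would also \uXXXX-encode control bytes:

/// Append a JSON string body (no surrounding quotes): raw splice when no
/// byte needs escaping, allocate an escaped copy otherwise.
fn append_json_str(buf: &mut Vec<u8>, s: &str) {
    let bytes = s.as_bytes();
    let needs_escape = bytes
        .iter()
        .any(|&b| b == b'\\' || b == b'"' || b < 0x20 || b >= 0x7f);
    if needs_escape {
        // Slow path: allocate an escaped copy.
        let escaped = s.replace('\\', "\\\\").replace('"', "\\\"");
        buf.extend_from_slice(escaped.as_bytes());
    } else {
        // Fast path: plain-ASCII IDs splice straight into the buffer.
        buf.extend_from_slice(bytes);
    }
}

fn main() {
    let mut buf = b"{\"subscription_id\":\"".to_vec();
    append_json_str(&mut buf, "sub_01HZX5T9QK");
    buf.extend_from_slice(b"\"}");
    assert_eq!(buf, br#"{"subscription_id":"sub_01HZX5T9QK"}"#);
}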

backend/crates/kalamdb-live/src/helpers/initial_data.rs

Lines changed: 35 additions & 3 deletions
@@ -296,10 +296,43 @@ impl InitialDataFetcher {
         })
     }

-    /// Compute snapshot end sequence for a subscription
+    /// Compute snapshot end sequence for a subscription.
     ///
-    /// Uses MAX(_seq) with the same filters as initial data to define a snapshot boundary.
+    /// Fast path: since `_seq` is a Snowflake ID with embedded timestamp, the
+    /// maximum possible `_seq` at the current wall-clock millisecond is an
+    /// upper bound on every row already written. Any write performed *after*
+    /// this boundary is computed will get a strictly larger `_seq` (different
+    /// timestamp component) and therefore flow through the live notification
+    /// path, not the initial snapshot.
+    ///
+    /// This removes an entire DataFusion execution from the subscribe critical
+    /// path (previously ~several ms to tens of ms depending on planning cost),
+    /// which is one of the biggest wins for time-to-first-row.
+    ///
+    /// All arguments are accepted for API compatibility; `role`, `table_id`,
+    /// `table_type`, `options`, and `where_clause` are unused on the fast path.
     pub async fn compute_snapshot_end_seq(
+        &self,
+        _live_id: &kalamdb_commons::models::LiveQueryId,
+        _role: Role,
+        _table_id: &TableId,
+        _table_type: TableType,
+        _options: &InitialDataOptions,
+        _where_clause: Option<&str>,
+    ) -> Result<Option<SeqId>, LiveError> {
+        let now_ms = std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .map(|d| d.as_millis() as u64)
+            .unwrap_or(SeqId::EPOCH);
+
+        match SeqId::max_id_for_timestamp(now_ms) {
+            Ok(seq) => Ok(Some(seq)),
+            Err(e) => Err(LiveError::Other(format!("Failed to compute snapshot boundary: {}", e))),
+        }
+    }
+
+    #[allow(dead_code)]
+    async fn compute_snapshot_end_seq_sql_fallback(
         &self,
         live_id: &kalamdb_commons::models::LiveQueryId,
         role: Role,
@@ -308,7 +341,6 @@ impl InitialDataFetcher {
         options: &InitialDataOptions,
         where_clause: Option<&str>,
     ) -> Result<Option<SeqId>, LiveError> {
-        // Extract user_id from LiveId for RLS
        let user_id = live_id.user_id().clone();

        let table_name = table_id.full_name();
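The boundary trick in miniature. This sketch assumes the classic Snowflake split (millisecond timestamp in the high bits above a 22-bit shift); the real `SeqId::max_id_for_timestamp` bit layout lives elsewhere in the crate and may differ:

/// Hypothetical layout: timestamp_ms << 22, with worker and sequence bits
/// below (mirrors the classic Snowflake split; not the crate's actual SeqId).
const TIMESTAMP_SHIFT: u32 = 22;

/// Largest ID any writer could mint during `ts_ms`: that millisecond's
/// timestamp bits with every lower bit set.
fn max_id_for_timestamp(ts_ms: u64) -> u64 {
    (ts_ms << TIMESTAMP_SHIFT) | ((1u64 << TIMESTAMP_SHIFT) - 1)
}

fn main() {
    let boundary = max_id_for_timestamp(1_700_000_000_000);
    // A write in any later millisecond gets a strictly larger ID, so it is
    // guaranteed to surface via live notifications, never the snapshot.
    assert!(max_id_for_timestamp(1_700_000_000_001) > boundary);
}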

backend/crates/kalamdb-live/src/models/connection.rs

Lines changed: 2 additions & 1 deletion
@@ -73,7 +73,8 @@ fn intern_subscription_str(value: &str) -> Arc<str> {
 /// Maximum pending notifications per connection before dropping new ones.
 /// Keep this modest: large snapshot catch-up is handled by per-subscription
 /// flow control, while a smaller live buffer reduces worst-case memory per
-/// slow connection.
+/// slow connection. At 100k concurrent idle connections this directly
+/// governs the per-connection memory floor.
 pub const NOTIFICATION_CHANNEL_CAPACITY: usize = 256;

 /// Maximum pending control events per connection.
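A back-of-envelope for the 100k figure. The 64-byte slot size is an assumption for illustration; the real footprint depends on the notification type and the channel's internal bookkeeping:

fn main() {
    const NOTIFICATION_CHANNEL_CAPACITY: usize = 256;
    const CONNECTIONS: usize = 100_000;
    const ASSUMED_SLOT_BYTES: usize = 64; // hypothetical per-message size

    // Worst case, every channel full: 256 slots × 64 B × 100k ≈ 1.5 GiB
    // of buffered notifications alone.
    let worst_case = NOTIFICATION_CHANNEL_CAPACITY * ASSUMED_SLOT_BYTES * CONNECTIONS;
    println!("{:.2} GiB", worst_case as f64 / (1u64 << 30) as f64);
}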

backend/crates/kalamdb-live/src/notification.rs

Lines changed: 55 additions & 31 deletions
@@ -29,14 +29,26 @@ use tokio::sync::mpsc;
 /// Number of sharded notification workers.
 /// Deterministic routing by table_id hash preserves per-table ordering
 /// while achieving parallelism across different tables.
-const NUM_NOTIFY_WORKERS: usize = 4;
+///
+/// Scales with available CPUs (up to a hard cap) so multi-core deployments
+/// can fan out across more tables in parallel. Falls back to 4 on the
+/// (rare) platforms where `available_parallelism` is unavailable.
+fn num_notify_workers() -> usize {
+    // Cap at 16 to bound DashMap contention and worker overhead.
+    // Minimum of 4 preserves previous baseline behavior on small machines.
+    let cpus =
+        std::thread::available_parallelism().map(std::num::NonZeroUsize::get).unwrap_or(4);
+    cpus.clamp(4, 16)
+}

-/// Per-worker queue capacity. Total capacity = NUM_NOTIFY_WORKERS × this value.
+/// Per-worker queue capacity. Total capacity = workers × this value.
 const NOTIFY_QUEUE_PER_WORKER: usize = 4_096;

-/// Number of subscribers per parallel chunk for shared table streaming notification.
-/// Tuned to amortize tokio::spawn overhead while achieving parallelism at scale.
-const SHARED_NOTIFY_CHUNK_SIZE: usize = 256;
+/// Subscriber count above which we break fan-out into spawned chunks.
+/// For single-table fan-out at high subscriber counts (e.g. 100K on one
+/// table all hashing to one worker), spawning per-chunk lets the tokio
+/// runtime parallelise delivery across its thread pool.
+const SHARED_NOTIFY_CHUNK_SIZE: usize = 512;

 struct NotificationTask {
     user_id: Option<UserId>,
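The deterministic routing mentioned in the doc comment, sketched standalone. The hash function and worker count are stand-ins; the crate's actual hasher may differ:

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

/// Same table always hashes to the same worker, so per-table ordering is
/// preserved while distinct tables spread across the worker pool.
fn shard_for(table_id: &str, workers: usize) -> usize {
    let mut hasher = DefaultHasher::new();
    table_id.hash(&mut hasher);
    (hasher.finish() as usize) % workers
}

fn main() {
    let workers = 16; // the clamp ceiling above
    assert_eq!(shard_for("app.orders", workers), shard_for("app.orders", workers));
    // Different tables may (and usually do) land on different shards.
    println!("{} {}", shard_for("app.orders", workers), shard_for("app.users", workers));
}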
@@ -274,10 +286,11 @@ impl NotificationService {
     }

     pub fn new(registry: Arc<ConnectionsManager>) -> Arc<Self> {
-        let mut worker_txs = Vec::with_capacity(NUM_NOTIFY_WORKERS);
-        let mut worker_rxs = Vec::with_capacity(NUM_NOTIFY_WORKERS);
+        let worker_count = num_notify_workers();
+        let mut worker_txs = Vec::with_capacity(worker_count);
+        let mut worker_rxs = Vec::with_capacity(worker_count);

-        for _ in 0..NUM_NOTIFY_WORKERS {
+        for _ in 0..worker_count {
             let (tx, rx) = mpsc::channel(NOTIFY_QUEUE_PER_WORKER);
             worker_txs.push(tx);
             worker_rxs.push(rx);
@@ -424,35 +437,46 @@ impl NotificationService {
             );
         }

-        // Large fan-out: parallel chunked dispatch
-        // Large fan-out: collect handles once, then parallel chunked dispatch
-        let handles: Vec<SubscriptionHandle> =
+        // Large fan-out: spawn a task per chunk so the tokio runtime can
+        // parallelise delivery across its thread pool. When all subscribers
+        // are on the same table they hash to one notification worker —
+        // spawning is the only way to utilise multiple cores for the fan-out.
+        let handles_vec: Vec<SubscriptionHandle> =
             all_handles.iter().map(|entry| entry.value().clone()).collect();

-        let mut join_handles = Vec::with_capacity(
-            (handles.len() + SHARED_NOTIFY_CHUNK_SIZE - 1) / SHARED_NOTIFY_CHUNK_SIZE,
-        );
-
-        for chunk in handles.chunks(SHARED_NOTIFY_CHUNK_SIZE) {
-            let chunk = chunk.to_vec();
-            let nr = Arc::clone(&new_row);
-            let or = old_row.as_ref().map(Arc::clone);
-            let ct = change_type.clone();
-            let pk = Arc::clone(&pk_columns);
-
-            join_handles.push(tokio::spawn(async move {
-                dispatch_chunk(chunk.into_iter(), &nr, or.as_deref(), &ct, &pk, seq_value)
+        let table_id = table_id.clone();
+        let mut tasks = Vec::new();
+
+        for chunk in handles_vec.chunks(SHARED_NOTIFY_CHUNK_SIZE) {
+            let chunk_handles: Vec<SubscriptionHandle> = chunk.to_vec();
+            let new_row = Arc::clone(&new_row);
+            let old_row = old_row.as_ref().map(Arc::clone);
+            let change_type = change_type.clone();
+            let pk_columns = Arc::clone(&pk_columns);
+            let table_id = table_id.clone();
+
+            tasks.push(tokio::spawn(async move {
+                match dispatch_chunk(
+                    chunk_handles.into_iter(),
+                    &new_row,
+                    old_row.as_deref(),
+                    &change_type,
+                    &pk_columns,
+                    seq_value,
+                ) {
+                    Ok(count) => count,
+                    Err(e) => {
+                        log::error!("Notification dispatch error for table {}: {}", table_id, e);
+                        0
+                    },
+                }
             }));
         }

         let mut total = 0usize;
-        for jh in join_handles {
-            match jh.await {
-                Ok(Ok(count)) => total += count,
-                Ok(Err(e)) => {
-                    log::error!("Notification dispatch error for table {}: {}", table_id, e);
-                },
-                Err(e) => log::error!("Notification chunk task panicked: {}", e),
+        for task in tasks {
+            if let Ok(count) = task.await {
+                total += count;
             }
         }
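The fan-out shape in isolation, assuming a tokio runtime. The subscriber handles and per-chunk delivery body are stand-ins for the crate's `SubscriptionHandle` and `dispatch_chunk`:

use std::sync::Arc;

const CHUNK_SIZE: usize = 512;

#[tokio::main]
async fn main() {
    // Stand-in for many subscription handles on one hot table.
    let handles: Vec<u64> = (0..10_000).collect();
    // Shared payload: one Arc clone per chunk, never a deep copy.
    let payload: Arc<str> = Arc::from(r#"{"type":"change"}"#);

    let mut tasks = Vec::with_capacity(handles.len().div_ceil(CHUNK_SIZE));
    for chunk in handles.chunks(CHUNK_SIZE) {
        let chunk = chunk.to_vec(); // owned copy moves into the task
        let payload = Arc::clone(&payload);
        tasks.push(tokio::spawn(async move {
            // Stand-in for per-subscriber delivery; returns delivered count.
            chunk.iter().filter(|_| !payload.is_empty()).count()
        }));
    }

    // Panicked chunks are skipped rather than failing the whole fan-out.
    let mut total = 0usize;
    for task in tasks {
        if let Ok(count) = task.await {
            total += count;
        }
    }
    assert_eq!(total, 10_000);
}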
