diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index 8f7a660..0000000
--- a/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-.cache
-.idea
-target
-*.iml
-tools/release
\ No newline at end of file
diff --git a/e2e-iot/.gitignore b/e2e-iot/.gitignore
new file mode 100644
index 0000000..1de21e9
--- /dev/null
+++ b/e2e-iot/.gitignore
@@ -0,0 +1,24 @@
+############################
+# Java / Maven build output
+############################
+target
+*.class
+*.jar
+dependency-reduced-pom.xml
+
+############################
+# IDE / Editor files
+############################
+.idea/
+.vscode/
+.classpath
+.project
+.settings
+*.iml
+*.swp
+*.swo
+
+############################
+# OS-specific
+############################
+.DS_Store
diff --git a/e2e-iot/DEPLOYMENT_INSTRUCTIONS.md b/e2e-iot/DEPLOYMENT_INSTRUCTIONS.md
new file mode 100644
index 0000000..ab19a6d
--- /dev/null
+++ b/e2e-iot/DEPLOYMENT_INSTRUCTIONS.md
@@ -0,0 +1,300 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Complete Platform Deployment Instructions
+
+This document provides step-by-step instructions to deploy the complete Fluss platform with Flink integration on AWS EKS.
+
+## Prerequisites
+
+- AWS CLI configured with appropriate credentials
+- kubectl installed and configured
+- terraform installed (>= 1.0)
+- helm installed (>= 3.0)
+- Docker installed (for building images)
+- Maven installed (for building Flink job JAR)
+
+## Step 1: Push Images to ECR
+
+First, build and push all required Docker images to AWS ECR:
+
+```bash
+cd fluss/benchmark/e2e-platform-aws
+./push-images-to-ecr.sh
+```
+
+This script will:
+- Build the Fluss demo image (for producer and Flink aggregator)
+- Build/pull the Apache Fluss image
+- Push both images to your AWS ECR repositories
+
+**Options:**
+- `./push-images-to-ecr.sh --all` - Push both images (default)
+- `./push-images-to-ecr.sh --producer-only` - Push only producer image
+- `./push-images-to-ecr.sh --fluss-only` - Push only Fluss image
+
+## Step 2: Set Environment Variables
+
+Load the default environment variables:
+
+```bash
+cd fluss/benchmark/e2e-platform-aws
+source ./default.env.sh
+```
+
+This sets:
+- `DEMO_IMAGE_REPO` - ECR repository for demo image
+- `DEMO_IMAGE_TAG` - Image tag (default: latest)
+- `FLUSS_IMAGE_REPO` - ECR repository for Fluss image
+- `NAMESPACE` - Kubernetes namespace (default: fluss)
+- `CLUSTER_NAME` - EKS cluster name (default: fluss-eks-cluster)
+- `REGION` - AWS region (default: us-west-2)
+
+## Step 3: Execute Deployment Scripts
+
+Navigate to the scripts directory and execute all deployment scripts in order:
+
+```bash
+cd fluss/benchmark/e2e-platform-aws/high-infra/k8s/scripts
+```
+
+Execute the following scripts in sequence:
+
+### 3.1: Deploy Infrastructure (Terraform)
+```bash
+./00-deploy-infra.sh
+```
+
+This script:
+- Creates EKS cluster using Terraform
+- Sets up VPC, subnets, and networking
+- Creates ECR repositories
+- Sets up S3 bucket for Flink checkpoints
+- Configures IAM roles and policies
+
+**Expected duration:** 15-20 minutes
+
+### 3.2: Update Kubeconfig
+```bash
+./01-update-kubeconfig.sh
+```
+
+This script:
+- Updates kubeconfig to connect to the EKS cluster
+- Verifies cluster connectivity
+
+### 3.3: Setup Storage
+```bash
+./02-setup-storage.sh
+```
+
+This script:
+- Creates local storage class for persistent volumes
+- Sets up persistent volume claims for Fluss components
+
+### 3.4: Deploy Components
+```bash
+./03-deploy-components.sh
+```
+
+This script:
+- Deploys ZooKeeper
+- Deploys Apache Fluss (Coordinator and Tablet Servers)
+- Deploys Flink cluster (JobManager and TaskManagers)
+- Deploys monitoring stack (Prometheus and Grafana)
+
+**Expected duration:** 5-10 minutes
+
+### 3.5: Verify Storage
+```bash
+./04-verify-storage.sh
+```
+
+This script:
+- Verifies persistent volume claims are bound
+- Checks storage mounts on Fluss tablet servers
+
+### 3.6: Deploy Producer
+```bash
+./05-deploy-producer.sh
+```
+
+This script:
+- Creates Fluss table with specified buckets
+- Deploys multi-instance producer job
+- Starts data ingestion
+
+**Expected duration:** 2-3 minutes
+
+### 3.7: Submit Flink Job
+```bash
+./06-submit-flink-job.sh
+```
+
+This script:
+- Submits Flink aggregator job to the cluster
+- Configures job to read from Fluss log table
+- Starts real-time aggregation
+
+**Expected duration:** 1-2 minutes
+
+### 3.8: Deploy Dashboard
+```bash
+./07-deploy-dashboard.sh
+```
+
+This script:
+- Deploys Grafana dashboard ConfigMap
+- Imports Fluss and Flink monitoring dashboard
+- Configures Prometheus data sources
+
+### 3.9: Verify Deployment
+```bash
+./08-verify-deployment.sh
+```
+
+This script:
+- Verifies all pods are running
+- Checks ServiceMonitors and PodMonitors
+- Verifies producer and Flink job are active
+- Displays access information
+
+### 3.10: View End-to-End Metrics
+```bash
+./09-view-metrics.sh
+```
+
+This script:
+- Changes to the e2e-platform-aws directory
+- Launches Grafana port-forward to view end-to-end metrics
+- Opens access to Grafana dashboard for platform monitoring
+- Displays real-time metrics and dashboards
+
+**Note:** This script will run in the foreground. Press Ctrl+C to stop port-forwarding.
+
+**Expected duration:** Runs until stopped (Ctrl+C)
+
+## Step 4: Access Services
+
+### Access Flink Web UI
+
+Use the port-forward script:
+```bash
+cd fluss/benchmark/e2e-platform-aws
+./port-forward-flink.sh
+```
+
+Or manually:
+```bash
+kubectl port-forward -n fluss svc/flink-jobmanager 8081:8081
+```
+
+Then open: http://localhost:8081
+
+### Access Grafana Dashboard
+
+Use the port-forward script:
+```bash
+cd fluss/benchmark/e2e-platform-aws
+./port-forward-grafana.sh
+```
+
+Or manually:
+```bash
+kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80
+```
+
+Then open: http://localhost:3000
+- **Username:** `admin`
+- **Password:** `admin123`
+
+## Quick Reference
+
+### Check Pod Status
+```bash
+kubectl get pods -n fluss
+kubectl get pods -n monitoring
+```
+
+### Check Flink Job Status
+```bash
+kubectl get pods -n fluss -l app=flink
+kubectl logs -n fluss -l app=flink,component=jobmanager --tail=50
+```
+
+### Check Producer Status
+```bash
+kubectl get pods -n fluss -l app=fluss-producer
+kubectl logs -n fluss -l app=fluss-producer --tail=50
+```
+
+### Check Fluss Components
+```bash
+kubectl get pods -n fluss -l app.kubernetes.io/name=fluss
+kubectl get svc -n fluss
+```
+
+## Troubleshooting
+
+### Pods Not Starting
+```bash
+# Check pod events
+kubectl describe pod <pod-name> -n fluss
+
+# Check pod logs
+kubectl logs <pod-name> -n fluss
+```
+
+### Flink Job Not Running
+```bash
+# Check JobManager logs
+kubectl logs -n fluss -l app=flink,component=jobmanager --tail=100
+
+# Check TaskManager logs
+kubectl logs -n fluss -l app=flink,component=taskmanager --tail=100
+```
+
+### Storage Issues
+```bash
+# Check PVC status
+kubectl get pvc -n fluss
+
+# Check storage class
+kubectl get storageclass
+```
+
+### Network Issues
+```bash
+# Check services
+kubectl get svc -n fluss
+
+# Check endpoints
+kubectl get endpoints -n fluss
+```
+
+## Cleanup
+
+To destroy all resources:
+
+```bash
+cd fluss/benchmark/e2e-platform-aws/high-infra/terraform
+terraform destroy
+```
+
+**Warning:** This will delete the EKS cluster, VPC, and all associated resources.
+
diff --git a/e2e-iot/README.md b/e2e-iot/README.md
new file mode 100644
index 0000000..4bdc8f8
--- /dev/null
+++ b/e2e-iot/README.md
@@ -0,0 +1,507 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+<!--
+  2 Million Rows Per Second Benchmark Documentation
+  ===================================================
+  
+  This README documents the benchmark setup and results for testing Fluss's capability
+  to handle 2 million rows per second throughput with real-time Flink aggregation.
+  
+  Key Components:
+  - 8 Producer Instances (250K records/sec each)
+  - 100,000 Devices distributed across producers
+  - Fluss Table with 128 partitions across 3 tablet servers
+  - Flink Aggregation Job with 4 operators
+  
+  See bench-mark-images/ folder for performance diagrams and detailed metrics.
+-->
+
+# 2 Million Rows Per Second Benchmark
+
+This benchmark demonstrates Fluss's capability to handle **2 million rows per second** throughput with real-time Flink aggregation processing.
+
+## Overview
+
+This benchmark setup consists of:
+- **8 Producer Instances** generating sensor data at 250K records/sec each (2M total)
+- **100,000 Devices** distributed across all producers
+- **Fluss Table** with 128 partitions (buckets) across 3 tablet servers
+- **Flink Aggregation Job** processing the data stream in real-time
+
+## Producer Data Generation
+
+### Device Distribution Logic
+
+The producer uses a multi-instance architecture to distribute 100,000 devices across 8 producer instances:
+
+**Key Implementation Details:**
+- Total devices: **100,000** (constant across all instances)
+- Total producers: **8 instances** (instance IDs: 0-7)
+- Device ID range calculation per instance:
+  ```java
+  int devicesPerInstance = TOTAL_DEVICES / options.totalProducers;  // 100,000 / 8 = 12,500
+  int startDeviceId = options.instanceId * devicesPerInstance;
+  int endDeviceId = (options.instanceId == options.totalProducers - 1) 
+          ? TOTAL_DEVICES  // Last instance gets remainder
+          : (options.instanceId + 1) * devicesPerInstance;
+  ```
+
+**Device ID Ranges:**
+- Instance 0: Devices 0 to 12,499 (12,500 devices)
+- Instance 1: Devices 12,500 to 24,999 (12,500 devices)
+- Instance 2: Devices 25,000 to 37,499 (12,500 devices)
+- Instance 3: Devices 37,500 to 49,999 (12,500 devices)
+- Instance 4: Devices 50,000 to 62,499 (12,500 devices)
+- Instance 5: Devices 62,500 to 74,999 (12,500 devices)
+- Instance 6: Devices 75,000 to 87,499 (12,500 devices)
+- Instance 7: Devices 87,500 to 99,999 (12,500 devices)
+
+**Rate Distribution:**
+- Each producer instance generates **250,000 records/second**
+- Rate is evenly distributed across all devices in the instance's range
+- Each device generates approximately **20 records/second** (250,000 / 12,500)
+
+**Producer Configuration:**
+- Location: `high-infra/k8s/jobs/deploy-producer-multi-instance.sh`
+- Each instance runs as a Kubernetes Job with:
+  - Instance ID passed via `INSTANCE_ID` environment variable
+  - Total producers count via `TOTAL_PRODUCERS` environment variable
+  - Rate per instance: 250,000 records/sec
+  - Writer threads: 48 threads per instance
+  - Batch size: 128MB
+  - Buffer memory: 2GB
+  - Batch timeout: 90ms
+
+## Fluss Table Configuration
+
+### Table Details
+
+**Table Schema:**
+- **Database:** `iot`
+- **Table:** `sensor_readings`
+- **Primary Key:** `sensor_id` (INT)
+- **Partitions:** 128 buckets (distributed by `sensor_id` hash)
+
+**Table Columns:**
+1. `sensor_id` (INT) - Primary key, maps to device ID (0-99,999)
+2. `sensor_type` (INT) - Sensor type (1-8)
+3. `temperature` (DOUBLE) - Temperature reading
+4. `humidity` (DOUBLE) - Humidity reading
+5. `pressure` (DOUBLE) - Pressure reading
+6. `battery_level` (DOUBLE) - Battery level
+7. `status` (INT) - Device status (1=online, 2=offline, 3=maintenance, 4=error)
+8. `timestamp` (BIGINT) - Timestamp in milliseconds
+
+**Table Distribution:**
+- **Buckets:** 128 partitions
+- **Distribution:** Hash-based on `sensor_id` primary key
+- **Tablet Servers:** 3 replicas
+- **Replication:** Data is replicated across tablet servers for high availability
+
+**Tablet Server Configuration:**
+- **Count:** 3 tablet servers
+- **Instance Type:** i7i.8xlarge (or similar high-performance instance)
+- **Storage:** Local NVMe drives for optimal I/O performance
+- **Data Path:** `/opt/alldata/fluss/data` (mounted on NVMe)
+
+**Table Creation:**
+- Script: `high-infra/k8s/jobs/create-table.sh`
+- Default buckets: 128
+- Table is created before producer deployment
+- Uses `distributedBy(buckets, "sensor_id")` for hash-based distribution
+
+## Flink Aggregation Job
+
+### Job Overview
+
+The Flink job (`FlinkSensorAggregatorJob`) reads from the Fluss table, performs real-time aggregations, and outputs aggregated sensor data.
+
+**Job Configuration:**
+- **Parallelism:** 192 (distributed across 6 TaskManager pods)
+- **TaskManager Slots:** 32 slots per TaskManager
+- **Checkpointing:** Enabled with 5-minute intervals
+- **Checkpoint Mode:** AT_LEAST_ONCE
+- **State Backend:** S3-based (for fault tolerance)
+
+### Flink Operators
+
+The Flink job consists of **4 main operators**:
+
+#### 1. FlussChangelogFilter
+- **Type:** Filter operator
+- **Function:** Filters changelog stream to only process INSERT and UPDATE_AFTER events
+- **Input:** Raw changelog stream from Fluss table (includes INSERT, UPDATE_BEFORE, UPDATE_AFTER, DELETE)
+- **Output:** Only INSERT and UPDATE_AFTER events
+- **Purpose:** Ensures we only process new/updated records, ignoring DELETE and UPDATE_BEFORE events
+
+#### 2. FlussSensorReadingMapper
+- **Type:** Map operator (RichMapFunction)
+- **Function:** 
+  - Converts Fluss Row format to SensorReading object
+  - Extracts sensor data from table columns
+  - Calculates event time lag (difference between event time and processing time)
+  - Emits custom metrics:
+    - `records_in`: Counter for input records
+    - `event_time_lag_ms`: Gauge for event time lag
+- **Input:** Filtered Row objects from FlussChangelogFilter
+- **Output:** SensorReading objects with enriched metadata
+- **Metrics:** 
+  - `flink_taskmanager_job_task_operator_fluss_aggregator_records_in`
+  - `flink_taskmanager_job_task_operator_fluss_aggregator_event_time_lag_ms`
+
+#### 3. TumblingWindowAggregation
+- **Type:** Window operator with aggregation
+- **Function:**
+  - Groups records by `sensor_id` (keyBy operation)
+  - Applies tumbling window of 1 minute (processing time)
+  - Performs incremental aggregation:
+    - **SensorAggregateFunction:** Incrementally aggregates:
+      - Count of records
+      - Sum of temperature, humidity, pressure, battery_level
+      - Min/Max of temperature, humidity, pressure, battery_level
+      - First/last timestamp
+  - **WindowEnricher:** Finalizes window results:
+    - Calculates averages (sum / count)
+    - Computes time span (last - first timestamp)
+    - Creates SensorAggregate object
+- **Window Type:** TumblingProcessingTimeWindows (1 minute)
+- **Input:** SensorReading objects from FlussSensorReadingMapper
+- **Output:** SensorAggregate objects (one per sensor per window)
+- **Optimization:** 
+  - Disabled operator chaining for better parallelism
+  - Optimized min/max calculations (direct comparisons)
+  - Optimized average calculation (multiplication instead of division)
+
+#### 4. FlussAggregatorSink
+- **Type:** Sink operator (RichSinkFunction)
+- **Function:**
+  - Receives aggregated SensorAggregate objects
+  - Increments output counter for every record
+  - Logs aggregated results every 20,000 records
+  - Emits custom metrics:
+    - `records_out`: Counter for output records
+- **Input:** SensorAggregate objects from TumblingWindowAggregation
+- **Output:** Logged to Flink logs (can be extended to write to ClickHouse or other sinks)
+- **Metrics:** 
+  - `flink_taskmanager_job_task_operator_fluss_aggregator_records_out`
+- **Optimization:**
+  - Lazy evaluation of full record conversion (only when logging)
+  - Uses LOG.info() instead of System.out.println for async logging
+  - Disabled operator chaining for better resource isolation
+
+### Data Flow
+
+```
+Fluss Table (sensor_readings)
+    ↓
+FlussChangelogFilter (filter INSERT/UPDATE_AFTER)
+    ↓
+FlussSensorReadingMapper (convert Row → SensorReading)
+    ↓
+TumblingWindowAggregation (1-minute windows, group by sensor_id)
+    ↓
+FlussAggregatorSink (log aggregated results)
+```
+
+### Performance Characteristics
+
+- **Input Rate:** ~2 million records/second (from 8 producers)
+- **Window Size:** 1 minute tumbling windows
+- **Output Rate:** ~1,667 aggregates/second (100,000 devices / 60 seconds)
+- **Latency:** Sub-second processing latency
+- **Throughput:** Handles full 2M records/sec with optimized operators
+
+## Benchmark Results
+
+The following diagrams show detailed performance metrics from the 2 million rows per second benchmark run. Each diagram includes explanations of what it measures and what to look for.
+
+### Deployment Diagram
+
+![Deployment Architecture](bench-mark-images/10-Fluss%20Deployment.png)
+
+**What it shows:**
+This diagram illustrates the complete architecture and deployment topology of the benchmark setup. It shows:
+- **Infrastructure Components:** AWS EKS cluster with multiple node groups including Fluss coordinator (1 instance), Fluss tablet servers (3 instances), Flink JobManager (1 instance), Flink TaskManagers (6 instances), Producer nodes (4 nodes with 8 producer instances), and Monitoring stack (Prometheus + Grafana)
+- **Network Architecture:** VPC with public and private subnets, load balancers, service endpoints, and inter-component communication paths
+- **Data Flow:** Producer → Fluss Tablet Servers → Flink (via Fluss catalog) → Aggregated Output
+- **Storage:** S3 buckets for Flink checkpoints, NVMe storage for tablet servers, and EBS volumes for persistent data
+
+**Key Insights:** Demonstrates the distributed nature of the setup, shows how data flows through the system, illustrates the separation of compute and storage layers, and highlights the scalability of the architecture.
+
+---
+
+### 1. Producer Throughput
+
+![Producer Throughput](bench-mark-images/1-Fluss-Producer.png)
+
+**What it shows:**
+This diagram monitors the data generation rate from all producer instances. It displays:
+- **Total Records Per Second:** Aggregated rate from all 8 producer instances (target: 2,000,000 records/second)
+- **Per-Instance Rates:** Individual producer instance throughput (250K per instance × 8)
+- **Total Records:** Cumulative count of records generated over time
+- **Time Series:** Shows throughput stability and consistency over the benchmark duration
+
+**What to look for:**
+- Consistent rate at or near 2M records/sec
+- Minimal fluctuations indicating stable generation
+- All 8 instances contributing equally
+- No rate degradation over time
+
+---
+
+### 2. Flink Consumer
+
+![Flink Consumer](bench-mark-images/2-flink-consumer.png)
+
+**What it shows:**
+This diagram shows how Flink consumes data from the Fluss table. It displays:
+- **Consumer Throughput:** Rate at which Flink reads from Fluss (should match producer rate ~2M records/sec)
+- **Records In:** Number of records consumed per second
+- **Consumer Lag:** Delay between data production and consumption (should be minimal, < 1 second)
+- **Partition Distribution:** How data is distributed across Flink subtasks
+
+**What to look for:**
+- Consumer rate matching producer rate (~2M records/sec)
+- Low and stable consumer lag
+- Even distribution across Flink subtasks
+- No backpressure indicators
+
+---
+
+### 3. Flink Overall Operator Throughput
+
+![Flink Operator Throughput](bench-mark-images/3-flink-overall-operator-throughput.png)
+
+**What it shows:**
+This diagram shows throughput across all Flink operators in the pipeline. It displays:
+- **Operator-Level Metrics:** Throughput for FlussChangelogFilter, FlussSensorReadingMapper, TumblingWindowAggregation, and FlussAggregatorSink
+- **Records In/Out per Operator:** Shows data flow through each stage
+- **Operator Utilization:** CPU and memory usage per operator
+- **Parallelism Distribution:** How work is distributed across subtasks
+
+**What to look for:**
+- Consistent throughput across all operators
+- No significant drops between operators (indicating no data loss)
+- Balanced operator utilization
+- Identification of any bottleneck operators
+
+---
+
+### 4. Flink End-to-End Data Lag
+
+![Flink Data Lag](bench-mark-images/4-flink-end-to-end-data-lag.png)
+
+**What it shows:**
+This diagram measures the latency from data production to final aggregation output. It displays:
+- **Event Time Lag:** Difference between event timestamp and processing time (should be < 1 second)
+- **Processing Latency:** Time taken for data to flow through Flink pipeline
+- **Window Processing Delay:** Delay in window aggregation completion
+- **End-to-End Latency:** Total time from producer to sink (should be sub-second)
+
+**What to look for:**
+- Low and stable event time lag (< 1 second)
+- Consistent processing latency
+- No lag spikes indicating bottlenecks
+- Sub-second end-to-end latency
+
+---
+
+### 5. Flink Back Pressure
+
+![Flink Back Pressure](bench-mark-images/5-flink-back-pressure.png)
+
+**What it shows:**
+This diagram monitors backpressure indicators to identify bottlenecks in the Flink pipeline. It displays:
+- **Backpressure Status:** Per-operator backpressure indicators (OK, LOW, HIGH)
+- **Operator Busy Percentage:** CPU utilization per operator (should be < 100%)
+- **Queue Sizes:** Input/output queue sizes for each operator
+- **Idle Time:** Operator idle time (inverse of busy percentage)
+
+**What to look for:**
+- No backpressure (all operators showing OK status)
+- Balanced operator busy percentages
+- No operators stuck at 100% busy
+- Healthy buffer utilization
+
+---
+
+### 6. Fluss Tablet Server Throughput
+
+![Fluss Tablet Server Throughput](bench-mark-images/5-fluss-tablet-server-throughput.png)
+
+**What it shows:**
+This diagram shows the throughput and performance of Fluss tablet servers handling writes. It displays:
+- **Messages In Rate:** Rate at which tablet servers receive messages (should match producer rate ~2M messages/second)
+- **Bytes In/Out Rate:** Data transfer rates to/from tablet servers
+- **Write Throughput:** Aggregate write performance across all 3 tablet servers
+- **Per-Server Metrics:** Individual tablet server performance
+
+**What to look for:**
+- Sustained 2M messages/second across tablet servers
+- Even distribution across 3 tablet servers
+- High write throughput
+- No performance degradation
+
+---
+
+### 7. Fluss Tablet Server Request by Type
+
+![Fluss Tablet Server Requests](bench-mark-images/6-Fluss_tablet-server-request-by-type.png)
+
+**What it shows:**
+This diagram breaks down tablet server requests by operation type to understand workload patterns. It displays:
+- **Request Types:** 
+  - `produceLog`: Write operations (should be highest, matching producer write rate)
+  - `fetchLogClient`: Read operations from Flink (matching Flink read rate)
+  - `putKv`: Key-value operations
+  - Other request types
+- **Request Rates:** Requests per second per type
+- **Request Distribution:** Percentage breakdown of request types
+- **Per-Server Breakdown:** Request distribution across tablet servers
+
+**What to look for:**
+- High produceLog rate (matching producer write rate)
+- Moderate fetchLogClient rate (matching Flink read rate)
+- Balanced request distribution across tablet servers
+- No unusual request patterns
+
+---
+
+### 8. Fluss Tablet Server CPU
+
+![Fluss Tablet Server CPU](bench-mark-images/7-Fluss-tablet-server-CPU.png)
+
+**What it shows:**
+This diagram monitors CPU utilization of tablet servers to ensure they're not bottlenecked. It displays:
+- **CPU Usage:** CPU utilization percentage per tablet server (50-80% is healthy)
+- **CPU Load:** System load average
+- **JVM CPU:** Java process CPU usage
+- **CPU Trends:** CPU usage over time
+
+**What to look for:**
+- Moderate CPU usage (50-80% is healthy)
+- No CPU saturation (100% indicates bottleneck)
+- Balanced CPU across all 3 tablet servers
+- Stable CPU usage without spikes
+
+---
+
+### 9. Flink Aggregation Input
+
+![Flink Aggregation Input](bench-mark-images/8-Flink_aggregation-In.png)
+
+**What it shows:**
+This diagram shows the input rate to the Flink aggregation operators (window and sink). It displays:
+- **Aggregation Input Rate:** Records per second entering aggregation operators (~2M records/sec)
+- **Window Input:** Records entering tumbling window operator (~2M per second, i.e. ~120M per 1-minute window)
+- **Key Distribution:** Distribution of records by sensor_id
+- **Input Stability:** Consistency of input rate over time
+
+**What to look for:**
+- Consistent input rate (~2M records/sec)
+- Stable input without drops
+- Even key distribution
+- No input rate spikes or gaps
+
+---
+
+### 10. Flink Aggregation Output
+
+![Flink Aggregation Output](bench-mark-images/9-Flink-aggregation-out.png)
+
+**What it shows:**
+This diagram shows the output rate from Flink aggregation (aggregated results per window). It displays:
+- **Aggregation Output Rate:** Aggregated records per second (~1,667/sec = 100K devices / 60 seconds)
+- **Window Output:** Number of aggregates produced per window (~100K aggregates per 1-minute window)
+- **Output Stability:** Consistency of output rate
+- **Aggregation Efficiency:** Ratio of input to output records (input:output should be ~1200:1 for 1-minute windows)
+
+**What to look for:**
+- Consistent output rate (~1,667 aggregates/sec)
+- One aggregate per device per window
+- Stable output without gaps
+- Proper aggregation (many input records → one output per device)
+
+---
+
+## Benchmark Summary
+
+These diagrams collectively demonstrate:
+
+1. **Producer Performance:** All 8 instances generating data at target rate (2M records/sec)
+2. **Fluss Performance:** Tablet servers handling writes efficiently with balanced load
+3. **Flink Performance:** Real-time processing with low latency and no backpressure
+4. **End-to-End Latency:** Sub-second processing from producer to aggregated output
+5. **System Stability:** Consistent performance throughout the benchmark duration
+6. **Scalability:** System handling 2M records/sec with room for growth
+
+### Key Performance Indicators
+
+- **Throughput:** Sustained 2,000,000 records/second
+- **Latency:** Sub-second end-to-end processing
+- **Availability:** No data loss, no backpressure
+- **Efficiency:** Balanced resource utilization across all components
+- **Scalability:** Linear scaling with additional resources
+
+## Infrastructure Costs
+
+The benchmark infrastructure consists of the following AWS EC2 instances running 24/7 for one month (30 days):
+
+### Instance Breakdown
+
+1. **Fluss Tablet Servers (i7i.8xlarge)**
+   - **Nodes:** 3 instances
+   - **Hourly Cost:** $3.02 per hour
+   - **Monthly Cost:** 3 × 24 × 30 × $3.02 = **$6,523.20**
+
+2. **Flink TaskManagers (C5.4xlarge)**
+   - **Nodes:** 6 instances
+   - **Hourly Cost:** $0.68 per hour
+   - **Monthly Cost:** 6 × 24 × 30 × $0.68 = **$2,937.60**
+
+3. **Producer Nodes (C5.3xlarge)**
+   - **Nodes:** 8 instances
+   - **Hourly Cost:** $0.35 per hour
+   - **Monthly Cost:** 8 × 24 × 30 × $0.35 = **$2,016.00**
+
+### Total Monthly Cost
+
+**Total Infrastructure Cost:** $6,523.20 + $2,937.60 + $2,016.00 = **$11,476.80/month**
+
+*Note: This cost calculation does not include additional AWS services such as EKS cluster management, S3 storage, networking (VPC, NAT Gateway), load balancers, or monitoring stack (Prometheus/Grafana). Actual costs may vary based on AWS pricing changes and additional resource usage.*
+
+## Deployment
+
+For deployment instructions, see:
+- `high-infra/k8s/DEPLOYMENT.md`
+- `high-infra/DEPLOY-STEPS.md`
+
+## Monitoring
+
+Grafana dashboards are available for monitoring:
+- Producer metrics (records/sec, total records)
+- Flink metrics (input/output rates, event time lag, CPU/memory)
+- Fluss metrics (tablet server throughput, coordinator status)
+
+Access Grafana:
+```bash
+kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80
+# Open http://localhost:3000 (admin/admin123)
+```
+
diff --git a/e2e-iot/bench-mark-images/1-Fluss-Producer.png b/e2e-iot/bench-mark-images/1-Fluss-Producer.png
new file mode 100644
index 0000000..702b4a5
Binary files /dev/null and b/e2e-iot/bench-mark-images/1-Fluss-Producer.png differ
diff --git a/e2e-iot/bench-mark-images/10-Fluss Deployment.png b/e2e-iot/bench-mark-images/10-Fluss Deployment.png
new file mode 100644
index 0000000..ad82a77
Binary files /dev/null and b/e2e-iot/bench-mark-images/10-Fluss Deployment.png differ
diff --git a/e2e-iot/bench-mark-images/2-flink-consumer.png b/e2e-iot/bench-mark-images/2-flink-consumer.png
new file mode 100644
index 0000000..51f3498
Binary files /dev/null and b/e2e-iot/bench-mark-images/2-flink-consumer.png differ
diff --git a/e2e-iot/bench-mark-images/3-flink-overall-operator-throughput.png b/e2e-iot/bench-mark-images/3-flink-overall-operator-throughput.png
new file mode 100644
index 0000000..9c96434
Binary files /dev/null and b/e2e-iot/bench-mark-images/3-flink-overall-operator-throughput.png differ
diff --git a/e2e-iot/bench-mark-images/4-flink-end-to-end-data-lag.png b/e2e-iot/bench-mark-images/4-flink-end-to-end-data-lag.png
new file mode 100644
index 0000000..c8f819b
Binary files /dev/null and b/e2e-iot/bench-mark-images/4-flink-end-to-end-data-lag.png differ
diff --git a/e2e-iot/bench-mark-images/5-flink-back-pressure.png b/e2e-iot/bench-mark-images/5-flink-back-pressure.png
new file mode 100644
index 0000000..8051872
Binary files /dev/null and b/e2e-iot/bench-mark-images/5-flink-back-pressure.png differ
diff --git a/e2e-iot/bench-mark-images/5-fluss-tablet-server-throughput.png b/e2e-iot/bench-mark-images/5-fluss-tablet-server-throughput.png
new file mode 100644
index 0000000..41f3e30
Binary files /dev/null and b/e2e-iot/bench-mark-images/5-fluss-tablet-server-throughput.png differ
diff --git a/e2e-iot/bench-mark-images/6-Fluss_tablet-server-request-by-type.png b/e2e-iot/bench-mark-images/6-Fluss_tablet-server-request-by-type.png
new file mode 100644
index 0000000..d0f6ea0
Binary files /dev/null and b/e2e-iot/bench-mark-images/6-Fluss_tablet-server-request-by-type.png differ
diff --git a/e2e-iot/bench-mark-images/7-Fluss-tablet-server-CPU.png b/e2e-iot/bench-mark-images/7-Fluss-tablet-server-CPU.png
new file mode 100644
index 0000000..da7f962
Binary files /dev/null and b/e2e-iot/bench-mark-images/7-Fluss-tablet-server-CPU.png differ
diff --git a/e2e-iot/bench-mark-images/8-Flink_aggregation-In.png b/e2e-iot/bench-mark-images/8-Flink_aggregation-In.png
new file mode 100644
index 0000000..c16e3b8
Binary files /dev/null and b/e2e-iot/bench-mark-images/8-Flink_aggregation-In.png differ
diff --git a/e2e-iot/bench-mark-images/9-Flink-aggregation-out.png b/e2e-iot/bench-mark-images/9-Flink-aggregation-out.png
new file mode 100644
index 0000000..b4e9bdf
Binary files /dev/null and b/e2e-iot/bench-mark-images/9-Flink-aggregation-out.png differ
diff --git a/e2e-iot/bench-mark-images/README.md b/e2e-iot/bench-mark-images/README.md
new file mode 100644
index 0000000..979068a
--- /dev/null
+++ b/e2e-iot/bench-mark-images/README.md
@@ -0,0 +1,344 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+<!--
+  Benchmark Diagrams Documentation
+  =================================
+  
+  This folder contains performance benchmark diagrams from the 2 million rows per second test run.
+  Diagrams are numbered in sequence (10-deployment first, then 1-9) to show the progression
+  of benchmark analysis from architecture overview to detailed performance metrics.
+  
+  Each diagram includes:
+  - What it shows (metrics and data displayed)
+  - What to look for (key performance indicators)
+  - Insights about system behavior
+-->
+
+# Benchmark Diagrams
+
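+This folder contains performance benchmark diagrams from the 2 million rows per second test run. The sections below are ordered to show the progression of the benchmark analysis; section numbers do not always match the numeric prefix of the image file (two files share the prefix `5-`), so use the file name given in each heading to locate a diagram.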
+
+## Diagram Sequence
+
+### 0. Deployment Diagram (`10-Fluss Deployment.png`)
+
+**Purpose:** Shows the complete architecture and deployment topology of the benchmark setup.
+
+**What it shows:**
+- **Infrastructure Components:**
+  - AWS EKS cluster with multiple node groups
+  - Fluss coordinator (1 instance)
+  - Fluss tablet servers (3 instances)
+  - Flink JobManager (1 instance)
+  - Flink TaskManagers (6 instances)
+  - Producer nodes (4 nodes with 8 producer instances)
+  - Monitoring stack (Prometheus + Grafana)
+  
+- **Network Architecture:**
+  - VPC with public and private subnets
+  - Load balancers and service endpoints
+  - Inter-component communication paths
+  
+- **Data Flow:**
+  - Producer → Fluss Tablet Servers
+  - Fluss → Flink (via Fluss catalog)
+  - Flink → Aggregated Output
+  
+- **Storage:**
+  - S3 buckets for Flink checkpoints
+  - NVMe storage for tablet servers
+  - EBS volumes for persistent data
+
+**Key Insights:**
+- Demonstrates the distributed nature of the setup
+- Shows how data flows through the system
+- Illustrates the separation of compute and storage layers
+- Highlights the scalability of the architecture
+
+---
+
+### 1. Producer Throughput (`1-Fluss-Producer.png`)
+
+**Purpose:** Monitors the data generation rate from all producer instances.
+
+**What it shows:**
+- **Total Records Per Second:** Aggregated rate from all 8 producer instances
+- **Per-Instance Rates:** Individual producer instance throughput
+- **Total Records:** Cumulative count of records generated
+- **Time Series:** Shows throughput over the benchmark duration
+
+**Key Metrics:**
+- Target: 2,000,000 records/second (250K per instance × 8)
+- Actual sustained rate during benchmark
+- Rate stability and consistency
+- Any rate fluctuations or drops
+
+**What to look for:**
+- Consistent rate at or near 2M records/sec
+- Minimal fluctuations indicating stable generation
+- All 8 instances contributing equally
+- No rate degradation over time
+
+---
+
+### 2. Flink Consumer (`2-flink-consumer.png`)
+
+**Purpose:** Shows how Flink consumes data from the Fluss table.
+
+**What it shows:**
+- **Consumer Throughput:** Rate at which Flink reads from Fluss
+- **Records In:** Number of records consumed per second
+- **Consumer Lag:** Delay between data production and consumption
+- **Partition Distribution:** How data is distributed across Flink subtasks
+
+**Key Metrics:**
+- Input records per second (should match producer rate)
+- Consumer lag (should be minimal, < 1 second)
+- Number of active consumers/subtasks
+- Throughput per subtask
+
+**What to look for:**
+- Consumer rate matching producer rate (~2M records/sec)
+- Low and stable consumer lag
+- Even distribution across Flink subtasks
+- No backpressure indicators
+
+---
+
+### 3. Flink Overall Operator Throughput (`3-flink-overall-operator-throughput.png`)
+
+**Purpose:** Shows throughput across all Flink operators in the pipeline.
+
+**What it shows:**
+- **Operator-Level Metrics:**
+  - FlussChangelogFilter throughput
+  - FlussSensorReadingMapper throughput
+  - TumblingWindowAggregation throughput
+  - FlussAggregatorSink throughput
+  
+- **Records In/Out per Operator:** Shows data flow through each stage
+- **Operator Utilization:** CPU and memory usage per operator
+- **Parallelism Distribution:** How work is distributed across subtasks
+
+**Key Metrics:**
+- Records in per second for each operator
+- Records out per second for each operator
+- Operator busy percentage
+- Throughput bottlenecks (if any)
+
+**What to look for:**
+- Consistent throughput across all operators
+- No significant drops between operators (indicating no data loss)
+- Balanced operator utilization
+- Identification of any bottleneck operators
+
+---
+
+### 4. Flink End-to-End Data Lag (`4-flink-end-to-end-data-lag.png`)
+
+**Purpose:** Measures the latency from data production to final aggregation output.
+
+**What it shows:**
+- **Event Time Lag:** Difference between event timestamp and processing time
+- **Processing Latency:** Time taken for data to flow through Flink pipeline
+- **Window Processing Delay:** Delay in window aggregation completion
+- **End-to-End Latency:** Total time from producer to sink
+
+**Key Metrics:**
+- Average event time lag (milliseconds)
+- P95/P99 event time lag
+- Processing time latency
+- Window completion delay
+
+**What to look for:**
+- Low and stable event time lag (< 1 second)
+- Consistent processing latency
+- No lag spikes indicating bottlenecks
+- Sub-second end-to-end latency
+
+---
+
+### 5. Flink Back Pressure (`5-flink-back-pressure.png`)
+
+**Purpose:** Monitors backpressure indicators to identify bottlenecks in the Flink pipeline.
+
+**What it shows:**
+- **Backpressure Status:** Per-operator backpressure indicators
+- **Operator Busy Percentage:** CPU utilization per operator
+- **Queue Sizes:** Input/output queue sizes for each operator
+- **Idle Time:** Operator idle time (time spent neither busy nor backpressured)
+
+**Key Metrics:**
+- Backpressure status (OK, LOW, HIGH)
+- Operator busy percentage (should be < 100%)
+- Input/output buffer utilization
+- Downstream operator blocking indicators
+
+**What to look for:**
+- No backpressure (all operators showing OK status)
+- Balanced operator busy percentages
+- No operators stuck at 100% busy
+- Healthy buffer utilization
+
+---
+
+### 6. Fluss Tablet Server Throughput (`5-fluss-tablet-server-throughput.png`)
+
+**Purpose:** Shows the throughput and performance of Fluss tablet servers handling writes.
+
+**What it shows:**
+- **Messages In Rate:** Rate at which tablet servers receive messages
+- **Bytes In/Out Rate:** Data transfer rates to/from tablet servers
+- **Write Throughput:** Aggregate write performance across all 3 tablet servers
+- **Per-Server Metrics:** Individual tablet server performance
+
+**Key Metrics:**
+- Messages per second (should match producer rate)
+- Bytes per second (incoming writes)
+- Replication bytes per second
+- Per-tablet-server distribution
+
+**What to look for:**
+- Sustained 2M messages/second across tablet servers
+- Even distribution across 3 tablet servers
+- High write throughput
+- No performance degradation
+
+---
+
+### 7. Fluss Tablet Server Request by Type (`6-Fluss_tablet-server-request-by-type.png`)
+
+**Purpose:** Breaks down tablet server requests by operation type to understand workload patterns.
+
+**What it shows:**
+- **Request Types:**
+  - `produceLog`: Write operations (should be highest)
+  - `fetchLogClient`: Read operations from Flink
+  - `putKv`: Key-value operations
+  - Other request types
+  
+- **Request Rates:** Requests per second per type
+- **Request Distribution:** Percentage breakdown of request types
+- **Per-Server Breakdown:** Request distribution across tablet servers
+
+**Key Metrics:**
+- ProduceLog requests/sec (write operations)
+- FetchLogClient requests/sec (read operations from Flink)
+- Request rate per tablet server
+- Request type distribution
+
+**What to look for:**
+- High produceLog rate (matching producer write rate)
+- Moderate fetchLogClient rate (matching Flink read rate)
+- Balanced request distribution across tablet servers
+- No unusual request patterns
+
+---
+
+### 8. Fluss Tablet Server CPU (`7-Fluss-tablet-server-CPU.png`)
+
+**Purpose:** Monitors CPU utilization of tablet servers to ensure they're not bottlenecked.
+
+**What it shows:**
+- **CPU Usage:** CPU utilization percentage per tablet server
+- **CPU Load:** System load average
+- **JVM CPU:** Java process CPU usage
+- **CPU Trends:** CPU usage over time
+
+**Key Metrics:**
+- Average CPU usage per tablet server
+- Peak CPU usage
+- CPU load average
+- JVM CPU time
+
+**What to look for:**
+- Moderate CPU usage (50-80% is healthy)
+- No CPU saturation (100% indicates bottleneck)
+- Balanced CPU across all 3 tablet servers
+- Stable CPU usage without spikes
+
+---
+
+### 9. Flink Aggregation Input (`8-Flink_aggregation-In.png`)
+
+**Purpose:** Shows the input rate to the Flink aggregation operators (window and sink).
+
+**What it shows:**
+- **Aggregation Input Rate:** Records per second entering aggregation operators
+- **Window Input:** Records entering tumbling window operator
+- **Key Distribution:** Distribution of records by sensor_id
+- **Input Stability:** Consistency of input rate over time
+
+**Key Metrics:**
+- Records per second entering aggregation
+- Records per window (should be ~120M per 1-minute window: 2M records/sec × 60 s)
+- Input rate stability
+- Key distribution evenness
+
+**What to look for:**
+- Consistent input rate (~2M records/sec)
+- Stable input without drops
+- Even key distribution
+- No input rate spikes or gaps
+
+---
+
+### 10. Flink Aggregation Output (`9-Flink-aggregation-out.png`)
+
+**Purpose:** Shows the output rate from Flink aggregation (aggregated results per window).
+
+**What it shows:**
+- **Aggregation Output Rate:** Aggregated records per second
+- **Window Output:** Number of aggregates produced per window
+- **Output Stability:** Consistency of output rate
+- **Aggregation Efficiency:** Ratio of input to output records
+
+**Key Metrics:**
+- Aggregated records per second (~1,667/sec = 100K devices / 60 seconds)
+- Records per window (should be ~100K aggregates per 1-minute window)
+- Output rate stability
+- Aggregation ratio (input:output should be ~1200:1 for 1-minute windows)
+
+**What to look for:**
+- Consistent output rate (~1,667 aggregates/sec)
+- One aggregate per device per window
+- Stable output without gaps
+- Proper aggregation (many input records → one output per device)
+
+---
+
+## Benchmark Summary
+
+These diagrams collectively demonstrate:
+
+1. **Producer Performance:** All 8 instances generating data at target rate (2M records/sec)
+2. **Fluss Performance:** Tablet servers handling writes efficiently with balanced load
+3. **Flink Performance:** Real-time processing with low latency and no backpressure
+4. **End-to-End Latency:** Sub-second processing from producer to aggregated output
+5. **System Stability:** Consistent performance throughout the benchmark duration
+6. **Scalability:** System handling 2M records/sec with room for growth
+
+## Key Performance Indicators
+
+- **Throughput:** Sustained 2,000,000 records/second
+- **Latency:** Sub-second end-to-end processing
+- **Availability:** No data loss, no backpressure
+- **Efficiency:** Balanced resource utilization across all components
+- **Scalability:** Linear scaling with additional resources
+
diff --git a/e2e-iot/default.env.sh b/e2e-iot/default.env.sh
new file mode 100755
index 0000000..f1c11ba
--- /dev/null
+++ b/e2e-iot/default.env.sh
@@ -0,0 +1,26 @@
+#!/bin/zsh
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
+AWS_REGION="us-west-2"
+export DEMO_IMAGE_REPO="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/fluss-demo"
+export DEMO_IMAGE_TAG="latest"
+export FLUSS_IMAGE_REPO="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/fluss"
+export NAMESPACE="fluss"
+export CLUSTER_NAME="fluss-eks-cluster"
+export REGION="${AWS_REGION}"
diff --git a/e2e-iot/ecr-repositories.txt b/e2e-iot/ecr-repositories.txt
new file mode 100644
index 0000000..7b2a965
--- /dev/null
+++ b/e2e-iot/ecr-repositories.txt
@@ -0,0 +1,26 @@
+# ECR Repository Details
+# Generated on: Mon Mar  2 20:51:31 IST 2026
+# AWS Account ID: 343218179954
+# AWS Region: us-west-2
+
+# Demo/Producer Image Repository
+DEMO_IMAGE_REPOSITORY="343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss-demo"
+DEMO_IMAGE_TAG="latest"
+
+# For terraform.tfvars:
+demo_image_repository = "343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss-demo"
+
+# Fluss Image Repository
+FLUSS_IMAGE_REPOSITORY="343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss"
+FLUSS_IMAGE_VERSION="0.8.0-incubating"
+
+# For terraform.tfvars:
+fluss_image_repository = "343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss"
+use_ecr_for_fluss = true
+
+# Full ECR Base URL
+ECR_BASE="343218179954.dkr.ecr.us-west-2.amazonaws.com"
+
+# To use the UPPER_CASE values in a shell (the lowercase "name = value" lines are terraform.tfvars snippets, not shell):
+# source <(grep -E '^[A-Z_]+=' ecr-repositories.txt)
+# echo ${DEMO_IMAGE_REPOSITORY}
diff --git a/e2e-iot/fluss_flink_realtime/Dockerfile b/e2e-iot/fluss_flink_realtime/Dockerfile
new file mode 100644
index 0000000..f5f25f6
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/Dockerfile
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+FROM eclipse-temurin:17-jre
+
+# Create the /opt/flink/{usrlib,lib,bin} directories so paths match the Flink image layout
+RUN mkdir -p /opt/flink/usrlib /opt/flink/lib /opt/flink/bin
+
+WORKDIR /opt/flink/usrlib
+
+# Install bash, curl, and netbase; `apt-get update` is retried once after 5 s in case of transient mirror sync issues.
+# NOTE(review): no package named getent is installed by this step — confirm netbase covers the hostname-resolution need.
+RUN apt-get update || (sleep 5 && apt-get update) && \
+    apt-get install -y bash curl netbase && \
+    rm -rf /var/lib/apt/lists/*
+
+# Copy the demo JAR (expected under target/ in the build context) into /opt/flink/usrlib
+COPY target/fluss-flink-realtime-demo.jar /opt/flink/usrlib/fluss-flink-realtime-demo.jar
+
+# Copy the entrypoint script and make it executable
+COPY entrypoint.sh /opt/flink/bin/entrypoint.sh
+RUN chmod +x /opt/flink/bin/entrypoint.sh
+
+# Run the entrypoint script when the container starts
+ENTRYPOINT ["/opt/flink/bin/entrypoint.sh"]
+
diff --git a/e2e-iot/fluss_flink_realtime/JDBCFlinkConsumer.java b/e2e-iot/fluss_flink_realtime/JDBCFlinkConsumer.java
new file mode 100644
index 0000000..2eba6c1
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/JDBCFlinkConsumer.java
@@ -0,0 +1,731 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.iot.pipeline.flink;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.flink.api.common.eventtime.WatermarkStrategy;
+import org.apache.flink.api.common.functions.AggregateFunction;
+import org.apache.flink.api.common.serialization.DeserializationSchema;
+import org.apache.flink.api.common.serialization.SimpleStringSchema;
+import org.apache.flink.api.common.state.ListState;
+import org.apache.flink.api.common.state.ListStateDescriptor;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.connector.pulsar.source.PulsarSource;
+import org.apache.flink.connector.pulsar.source.enumerator.cursor.StartCursor;
+import org.apache.flink.runtime.state.FunctionInitializationContext;
+import org.apache.flink.runtime.state.FunctionSnapshotContext;
+import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
+import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
+import org.apache.flink.streaming.api.windowing.time.Time;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+public class JDBCFlinkConsumer {
+    private static final ObjectMapper mapper = new ObjectMapper();
+    
+    /**
+     * Flink DeserializationSchema that decodes AVRO-binary SensorData payloads
+     * straight into SensorRecord instances.
+     */
+    public static class AvroSensorDataDeserializationSchema implements DeserializationSchema<SensorRecord> {
+        private transient Schema avroSchema;
+        private transient org.apache.avro.io.DatumReader<GenericRecord> datumReader;
+
+        /** Parses /avro/SensorData.avsc from the classpath and builds the datum reader; any failure is rethrown as RuntimeException. */
+        @Override
+        public void open(InitializationContext context) throws Exception {
+            try (InputStream in = getClass().getResourceAsStream("/avro/SensorData.avsc")) {
+                if (in == null) {
+                    throw new RuntimeException("AVRO schema file not found: /avro/SensorData.avsc");
+                }
+                avroSchema = new Schema.Parser().parse(in);
+                datumReader = new org.apache.avro.generic.GenericDatumReader<>(avroSchema);
+                System.out.println("✅ Loaded AVRO schema from JAR: " + avroSchema.getName());
+            } catch (Exception loadFailure) {
+                throw new RuntimeException("Failed to load AVRO schema", loadFailure);
+            }
+        }
+
+        /** Decodes one AVRO binary message into a SensorRecord; any failure (decode or conversion) is rethrown as IOException. */
+        @Override
+        public SensorRecord deserialize(byte[] message) throws IOException {
+            try {
+                org.apache.avro.io.Decoder binary = org.apache.avro.io.DecoderFactory.get().binaryDecoder(message, null);
+                return new SensorRecord(datumReader.read(null, binary));
+            } catch (Exception decodeFailure) {
+                throw new IOException("Failed to deserialize AVRO message", decodeFailure);
+            }
+        }
+
+        /** Always false: no element marks the end of the stream. */
+        @Override
+        public boolean isEndOfStream(SensorRecord nextElement) {
+            return false;
+        }
+
+        /** Type information for the produced SensorRecord elements. */
+        @Override
+        public TypeInformation<SensorRecord> getProducedType() {
+            return TypeInformation.of(SensorRecord.class);
+        }
+    }
+    
+    public static void main(String[] args) throws Exception {
+        String pulsarUrl = System.getenv().getOrDefault("PULSAR_URL", "pulsar://localhost:6650");
+        String pulsarAdminUrl = System.getenv().getOrDefault("PULSAR_ADMIN_URL", "http://localhost:8080");
+        String baseTopicName = System.getenv().getOrDefault("PULSAR_TOPIC", "persistent://public/default/iot-sensor-data");
+        String clickhouseUrl = System.getenv().getOrDefault("CLICKHOUSE_URL", "jdbc:clickhouse://localhost:8123/benchmark");
+        
+        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+        // Use parallelism from FlinkDeployment YAML or default to 4
+        // env.setParallelism(4);  // REMOVED - let YAML control parallelism
+        
+        // NOTE: Checkpointing is now configured via FlinkDeployment YAML
+        // The config includes: interval, mode (EXACTLY_ONCE), state backend (RocksDB), etc.
+        // This code is checkpoint-aware and will participate in checkpointing automatically
+        
+        System.out.println("Starting JDBC Flink IoT Consumer with AVRO Support and 1-Minute Aggregation...");
+        System.out.println("Pulsar URL: " + pulsarUrl);
+        System.out.println("Pulsar Admin URL: " + pulsarAdminUrl);
+        System.out.println("Consuming Topic: " + baseTopicName + " (all partitions)");
+        System.out.println("ClickHouse URL: " + clickhouseUrl);
+        System.out.println("Schema Type: AVRO");
+        System.out.println("Checkpointing: Enabled via FlinkDeployment config");
+        System.out.println("Aggregation: 1-minute tumbling windows per device_id with keyBy()");
+        System.out.println("Expected reduction: 30K msgs/sec → ~500-1000 aggregated records/min");
+        System.out.println("Using: Official Flink Pulsar Connector with AVRO deserialization");
+        
+               // Create Pulsar source using official Flink connector with AVRO deserialization
+               // The connector will automatically discover and consume from all partitions of the topic.
+               PulsarSource<SensorRecord> source = PulsarSource.builder()
+                       .setServiceUrl(pulsarUrl)
+                       .setAdminUrl(pulsarAdminUrl)
+                       .setTopics(baseTopicName) // Changed from topicName to baseTopicName
+                       .setSubscriptionName("flink-jdbc-consumer-avro")
+                       .setDeserializationSchema(new AvroSensorDataDeserializationSchema())
+                       .setStartCursor(StartCursor.earliest())
+                       .build();
+
+               DataStream<SensorRecord> sensorStream = env.fromSource(
+                       source,
+                       WatermarkStrategy.noWatermarks(),
+                       "Pulsar AVRO IoT Source"
+               );
+        
+        // Aggregate by device_id over 1-minute windows
+        sensorStream
+                .keyBy(record -> record.device_id)
+                .window(TumblingProcessingTimeWindows.of(Time.minutes(1)))
+                .aggregate(new SensorAggregator())
+                .addSink(new ClickHouseJDBCSink(clickhouseUrl));
+        
+        System.out.println("JDBC Flink job started!");
+        env.execute("JDBC IoT Data Pipeline");
+    }
+    
+        public static class SensorRecord implements java.io.Serializable {
+        // Flat, mutable row whose public fields match the benchmark.sensors_local schema
+        public String device_id;
+        public String device_type;
+        public String customer_id;
+        public String site_id;
+        public double latitude;
+        public double longitude;
+        public double altitude;
+        public double temperature;
+        public double humidity;
+        public double pressure;
+        public double co2_level;
+        public double noise_level;
+        public double light_level;
+        public int motion_detected;
+        public double battery_level;
+        public double signal_strength;
+        public double memory_usage;
+        public double cpu_usage;
+        public int status;
+        public int error_count;
+        public long packets_sent;
+        public long packets_received;
+        public long bytes_sent;
+        public long bytes_received;
+
+        /** No-arg constructor for the aggregator; every field keeps its Java default value. */
+        public SensorRecord() {
+        }
+
+        /**
+         * Builds a record from a JSON payload; accepts camelCase (sensorId, sensorType, batteryLevel) or snake_case keys and defaults every absent key.
+         */
+        public SensorRecord(JsonNode json) {
+            // path() yields a MissingNode rather than null, so asText(default) applies the default instead of throwing when both keys are absent
+            this.device_id = json.has("sensorId") ? json.get("sensorId").asText() : json.path("device_id").asText("device_unknown");
+            this.device_type = json.has("sensorType") ? json.get("sensorType").asText() : json.path("device_type").asText("unknown");
+            this.customer_id = json.has("customer_id") ? json.get("customer_id").asText() : "customer_0001";
+            this.site_id = json.has("site_id") ? json.get("site_id").asText() : json.has("location") ? json.get("location").asText() : "site_001";
+
+            // Location data: read from the nested "metadata" object when present, otherwise from top-level keys
+            JsonNode metadata = json.get("metadata");
+            if (metadata != null) {
+                this.latitude = metadata.has("latitude") ? metadata.get("latitude").asDouble() : 0.0;
+                this.longitude = metadata.has("longitude") ? metadata.get("longitude").asDouble() : 0.0;
+            } else {
+                this.latitude = json.has("latitude") ? json.get("latitude").asDouble() : 0.0;
+                this.longitude = json.has("longitude") ? json.get("longitude").asDouble() : 0.0;
+            }
+            this.altitude = json.has("altitude") ? json.get("altitude").asDouble() : 0.0;
+
+            // Sensor readings
+            this.temperature = json.has("temperature") ? json.get("temperature").asDouble() : 0.0;
+            this.humidity = json.has("humidity") ? json.get("humidity").asDouble() : 0.0;
+            this.pressure = json.has("pressure") ? json.get("pressure").asDouble() : 1013.25;
+            this.co2_level = json.has("co2_level") ? json.get("co2_level").asDouble() : 400.0;
+            this.noise_level = json.has("noise_level") ? json.get("noise_level").asDouble() : 50.0;
+            this.light_level = json.has("light_level") ? json.get("light_level").asDouble() : 500.0;
+            this.motion_detected = json.has("motion_detected") ? json.get("motion_detected").asInt() : 0;
+
+            // Device metrics
+            this.battery_level = json.has("batteryLevel") ? json.get("batteryLevel").asDouble() : json.has("battery_level") ? json.get("battery_level").asDouble() : 100.0;
+            this.signal_strength = json.has("signal_strength") ? json.get("signal_strength").asDouble() : -50.0;
+            this.memory_usage = json.has("memory_usage") ? json.get("memory_usage").asDouble() : 50.0;
+            this.cpu_usage = json.has("cpu_usage") ? json.get("cpu_usage").asDouble() : 30.0;
+
+            // Status: ints pass through; strings map online=1, offline=2, maintenance=3, anything else=4 (Locale.ROOT keeps matching locale-independent)
+            JsonNode statusNode = json.get("status");
+            if (statusNode == null) {
+                this.status = 1; // Default: online
+            } else if (statusNode.isInt()) {
+                this.status = statusNode.asInt();
+            } else {
+                String statusStr = statusNode.asText().toLowerCase(java.util.Locale.ROOT);
+                this.status = statusStr.equals("online") ? 1 : statusStr.equals("offline") ? 2 : statusStr.equals("maintenance") ? 3 : 4;
+            }
+
+            this.error_count = json.has("error_count") ? json.get("error_count").asInt() : 0;
+
+            // Network metrics
+            this.packets_sent = json.has("packets_sent") ? json.get("packets_sent").asLong() : 0L;
+            this.packets_received = json.has("packets_received") ? json.get("packets_received").asLong() : 0L;
+            this.bytes_sent = json.has("bytes_sent") ? json.get("bytes_sent").asLong() : 0L;
+            this.bytes_received = json.has("bytes_received") ? json.get("bytes_received").asLong() : 0L;
+        }
+
+        /**
+         * Builds a record from an AVRO GenericRecord. Only sensorId, sensorType, temperature, humidity,
+         * pressure, batteryLevel and status are read; every other field is set to a fixed default.
+         */
+        public SensorRecord(GenericRecord avroRecord) {
+            // Convert integer sensorId to string device_id
+            this.device_id = avroRecord.get("sensorId") != null ? "sensor_" + avroRecord.get("sensorId").toString() : "device_unknown";
+
+            // Convert integer sensorType to string device_type
+            int sensorTypeInt = avroRecord.get("sensorType") != null ? ((Number) avroRecord.get("sensorType")).intValue() : 1;
+            this.device_type = getSensorTypeString(sensorTypeInt);
+
+            this.customer_id = "customer_0001"; // Default value since not in AVRO schema
+
+            // No location field is read from the AVRO record - derive site_id (site_001..site_100) from sensorId.
+            // floorMod keeps the bucket non-negative even when sensorId is negative.
+            int sensorId = avroRecord.get("sensorId") != null ? ((Number) avroRecord.get("sensorId")).intValue() : 1;
+            this.site_id = "site_" + String.format("%03d", Math.floorMod(sensorId, 100) + 1);
+
+            // Location data - not read from the record, use defaults
+            this.latitude = 0.0; // Not in optimized schema
+            this.longitude = 0.0; // Not in optimized schema
+            this.altitude = 0.0; // Not in AVRO schema
+
+            // Sensor readings read from the record by their AVRO field names
+            this.temperature = avroRecord.get("temperature") != null ? ((Number) avroRecord.get("temperature")).doubleValue() : 0.0;
+            this.humidity = avroRecord.get("humidity") != null ? ((Number) avroRecord.get("humidity")).doubleValue() : 0.0;
+            this.pressure = avroRecord.get("pressure") != null ? ((Number) avroRecord.get("pressure")).doubleValue() : 1013.25;
+            this.co2_level = 400.0; // Default value since not in AVRO schema
+            this.noise_level = 50.0; // Default value since not in AVRO schema
+            this.light_level = 500.0; // Default value since not in AVRO schema
+            this.motion_detected = 0; // Default value since not in AVRO schema
+
+            // Device metrics - the AVRO field is named batteryLevel
+            this.battery_level = avroRecord.get("batteryLevel") != null ? ((Number) avroRecord.get("batteryLevel")).doubleValue() : 100.0;
+            this.signal_strength = -50.0; // Default value since not in AVRO schema
+            this.memory_usage = 50.0; // Default value since not in AVRO schema
+            this.cpu_usage = 30.0; // Default value since not in AVRO schema
+
+            // Status is read as a number; defaults to 1 when absent
+            this.status = avroRecord.get("status") != null ? ((Number) avroRecord.get("status")).intValue() : 1;
+
+            this.error_count = 0; // Default value since not in AVRO schema
+
+            // Network metrics - default values since not in AVRO schema
+            this.packets_sent = 0L;
+            this.packets_received = 0L;
+            this.bytes_sent = 0L;
+            this.bytes_received = 0L;
+        }
+
+        // Maps an integer sensor type (1-8) to its device_type name;
+        // any other value becomes "sensor_type_<n>".
+        private String getSensorTypeString(int sensorType) {
+            switch (sensorType) {
+                case 1: return "temperature_sensor";
+                case 2: return "humidity_sensor";
+                case 3: return "pressure_sensor";
+                case 4: return "motion_sensor";
+                case 5: return "light_sensor";
+                case 6: return "co2_sensor";
+                case 7: return "noise_sensor";
+                case 8: return "multisensor";
+                default: return "sensor_type_" + sensorType;
+            }
+        }
+    }
+    
+    /**
+     * Folds the SensorRecords of one window into a single record: averages for the numeric readings,
+     * totals for error and network counters. Per-metric min/max are tracked but not emitted by getResult().
+     */
+    public static class SensorAggregator implements AggregateFunction<SensorRecord, SensorAggregator.Accumulator, SensorRecord> {
+
+        public static class Accumulator {
+            // Metadata (taken from the first record added)
+            String device_id;
+            String device_type;
+            String customer_id;
+            String site_id;
+            double latitude;
+            double longitude;
+            double altitude;
+
+            // Sensor readings - track sum, min, max, count. Extremes are seeded with +/-infinity:
+            // Double.MIN_VALUE is the smallest POSITIVE double, so it would hide zero/negative maxima.
+            long count = 0;
+            // Temperature
+            double temp_sum = 0.0;
+            double temp_min = Double.POSITIVE_INFINITY;
+            double temp_max = Double.NEGATIVE_INFINITY;
+
+            // Humidity
+            double hum_sum = 0.0;
+            double hum_min = Double.POSITIVE_INFINITY;
+            double hum_max = Double.NEGATIVE_INFINITY;
+
+            // Pressure
+            double press_sum = 0.0;
+            double press_min = Double.POSITIVE_INFINITY;
+            double press_max = Double.NEGATIVE_INFINITY;
+
+            // CO2
+            double co2_sum = 0.0;
+            double co2_min = Double.POSITIVE_INFINITY;
+            double co2_max = Double.NEGATIVE_INFINITY;
+
+            // Noise
+            double noise_sum = 0.0;
+            double noise_min = Double.POSITIVE_INFINITY;
+            double noise_max = Double.NEGATIVE_INFINITY;
+
+            // Light
+            double light_sum = 0.0;
+            double light_min = Double.POSITIVE_INFINITY;
+            double light_max = Double.NEGATIVE_INFINITY;
+
+            // Battery
+            double battery_sum = 0.0;
+            double battery_min = Double.POSITIVE_INFINITY;
+            double battery_max = Double.NEGATIVE_INFINITY;
+
+            // Signal strength
+            double signal_sum = 0.0;
+            double signal_min = Double.POSITIVE_INFINITY;
+            double signal_max = Double.NEGATIVE_INFINITY;
+
+            // Status counters
+            int motion_detected_count = 0;
+            int error_count_sum = 0;
+            int status_sum = 0;
+
+            // Network metrics
+            long packets_sent_sum = 0;
+            long packets_received_sum = 0;
+            long bytes_sent_sum = 0;
+            long bytes_received_sum = 0;
+        }
+
+        @Override
+        public Accumulator createAccumulator() {
+            return new Accumulator();
+        }
+
+        /** Folds one record into acc; the first record added also supplies the accumulator's metadata. */
+        @Override
+        public Accumulator add(SensorRecord record, Accumulator acc) {
+            if (acc.count == 0) {
+                acc.device_id = record.device_id;
+                acc.device_type = record.device_type;
+                acc.customer_id = record.customer_id;
+                acc.site_id = record.site_id;
+                acc.latitude = record.latitude;
+                acc.longitude = record.longitude;
+                acc.altitude = record.altitude;
+            }
+
+            acc.count++;
+
+            // Temperature
+            acc.temp_sum += record.temperature;
+            acc.temp_min = Math.min(acc.temp_min, record.temperature);
+            acc.temp_max = Math.max(acc.temp_max, record.temperature);
+
+            // Humidity
+            acc.hum_sum += record.humidity;
+            acc.hum_min = Math.min(acc.hum_min, record.humidity);
+            acc.hum_max = Math.max(acc.hum_max, record.humidity);
+
+            // Pressure
+            acc.press_sum += record.pressure;
+            acc.press_min = Math.min(acc.press_min, record.pressure);
+            acc.press_max = Math.max(acc.press_max, record.pressure);
+
+            // CO2
+            acc.co2_sum += record.co2_level;
+            acc.co2_min = Math.min(acc.co2_min, record.co2_level);
+            acc.co2_max = Math.max(acc.co2_max, record.co2_level);
+
+            // Noise
+            acc.noise_sum += record.noise_level;
+            acc.noise_min = Math.min(acc.noise_min, record.noise_level);
+            acc.noise_max = Math.max(acc.noise_max, record.noise_level);
+
+            // Light
+            acc.light_sum += record.light_level;
+            acc.light_min = Math.min(acc.light_min, record.light_level);
+            acc.light_max = Math.max(acc.light_max, record.light_level);
+
+            // Battery
+            acc.battery_sum += record.battery_level;
+            acc.battery_min = Math.min(acc.battery_min, record.battery_level);
+            acc.battery_max = Math.max(acc.battery_max, record.battery_level);
+
+            // Signal
+            acc.signal_sum += record.signal_strength;
+            acc.signal_min = Math.min(acc.signal_min, record.signal_strength);
+            acc.signal_max = Math.max(acc.signal_max, record.signal_strength);
+
+            // Status and counters
+            if (record.motion_detected == 1) acc.motion_detected_count++;
+            acc.error_count_sum += record.error_count;
+            acc.status_sum += record.status;
+
+            // Network metrics
+            acc.packets_sent_sum += record.packets_sent;
+            acc.packets_received_sum += record.packets_received;
+            acc.bytes_sent_sum += record.bytes_sent;
+            acc.bytes_received_sum += record.bytes_received;
+
+            return acc;
+        }
+
+        /** Builds the output record: metadata from the first record, averaged readings, summed counters. */
+        @Override
+        public SensorRecord getResult(Accumulator acc) {
+            SensorRecord result = new SensorRecord();
+
+            result.device_id = acc.device_id;
+            result.device_type = acc.device_type;
+            result.customer_id = acc.customer_id;
+            result.site_id = acc.site_id;
+            result.latitude = acc.latitude;
+            result.longitude = acc.longitude;
+            result.altitude = acc.altitude;
+            // Average values; the divisor is clamped to 1 so an empty accumulator yields zeros, not NaN/ArithmeticException
+            final long n = Math.max(1L, acc.count);
+            result.temperature = acc.temp_sum / n;
+            result.humidity = acc.hum_sum / n;
+            result.pressure = acc.press_sum / n;
+            result.co2_level = acc.co2_sum / n;
+            result.noise_level = acc.noise_sum / n;
+            result.light_level = acc.light_sum / n;
+            result.battery_level = acc.battery_sum / n;
+            result.signal_strength = acc.signal_sum / n;
+
+            // Motion detected if > 50% of readings had motion
+            result.motion_detected = (acc.motion_detected_count > acc.count / 2) ? 1 : 0;
+
+            // Average status (integer division, truncated)
+            result.status = (int) (acc.status_sum / n);
+            result.error_count = acc.error_count_sum;
+
+            // Total network metrics
+            result.packets_sent = acc.packets_sent_sum;
+            result.packets_received = acc.packets_received_sum;
+            result.bytes_sent = acc.bytes_sent_sum;
+            result.bytes_received = acc.bytes_received_sum;
+
+            // CPU and memory usage are not accumulated; fixed placeholder values are emitted
+            result.memory_usage = 50.0; // Placeholder
+            result.cpu_usage = 30.0; // Placeholder
+
+            System.out.println("✅ Aggregated window: device=" + result.device_id + 
+                             ", count=" + acc.count + " records, avg_temp=" + 
+                             String.format("%.1f", result.temperature));
+
+            return result;
+        }
+
+        /** Merges b into a (for parallel processing). An empty a carries no metadata, so b is returned as-is. */
+        @Override
+        public Accumulator merge(Accumulator a, Accumulator b) {
+            if (a.count == 0) return b;
+            a.count += b.count;
+            // Temperature
+            a.temp_sum += b.temp_sum;
+            a.temp_min = Math.min(a.temp_min, b.temp_min);
+            a.temp_max = Math.max(a.temp_max, b.temp_max);
+
+            // Humidity
+            a.hum_sum += b.hum_sum;
+            a.hum_min = Math.min(a.hum_min, b.hum_min);
+            a.hum_max = Math.max(a.hum_max, b.hum_max);
+
+            // Pressure
+            a.press_sum += b.press_sum;
+            a.press_min = Math.min(a.press_min, b.press_min);
+            a.press_max = Math.max(a.press_max, b.press_max);
+
+            // CO2
+            a.co2_sum += b.co2_sum;
+            a.co2_min = Math.min(a.co2_min, b.co2_min);
+            a.co2_max = Math.max(a.co2_max, b.co2_max);
+
+            // Noise
+            a.noise_sum += b.noise_sum;
+            a.noise_min = Math.min(a.noise_min, b.noise_min);
+            a.noise_max = Math.max(a.noise_max, b.noise_max);
+
+            // Light
+            a.light_sum += b.light_sum;
+            a.light_min = Math.min(a.light_min, b.light_min);
+            a.light_max = Math.max(a.light_max, b.light_max);
+
+            // Battery
+            a.battery_sum += b.battery_sum;
+            a.battery_min = Math.min(a.battery_min, b.battery_min);
+            a.battery_max = Math.max(a.battery_max, b.battery_max);
+
+            // Signal
+            a.signal_sum += b.signal_sum;
+            a.signal_min = Math.min(a.signal_min, b.signal_min);
+            a.signal_max = Math.max(a.signal_max, b.signal_max);
+
+            // Counters
+            a.motion_detected_count += b.motion_detected_count;
+            a.error_count_sum += b.error_count_sum;
+            a.status_sum += b.status_sum;
+
+            // Network
+            a.packets_sent_sum += b.packets_sent_sum;
+            a.packets_received_sum += b.packets_received_sum;
+            a.bytes_sent_sum += b.bytes_sent_sum;
+            a.bytes_received_sum += b.bytes_received_sum;
+
+            return a;
+        }
+    }
+    
+    /**
+     * Checkpoint-aware ClickHouse JDBC Sink
+     * Flushes batches to ClickHouse during checkpoints for exactly-once semantics
+     */
+    public static class ClickHouseJDBCSink extends RichSinkFunction<SensorRecord> implements CheckpointedFunction {
+        private final String jdbcUrl;
+        private Connection connection;
+        private PreparedStatement insertStatement;
+        private int batchCount = 0;
+        private static final int BATCH_SIZE = 5000;  // Increased from 1000 to 5000 for better throughput
+        
+        // Checkpoint state
+        private transient ListState<Integer> batchCountState;
+        
+        public ClickHouseJDBCSink(String jdbcUrl) {
+            this.jdbcUrl = jdbcUrl;
+        }
+        
+        @Override
+        public void open(Configuration parameters) throws Exception {
+            super.open(parameters);
+            
+            System.out.println("Opening JDBC connection to ClickHouse: " + jdbcUrl);
+            
+            // Load ClickHouse JDBC driver
+            Class.forName("com.clickhouse.jdbc.ClickHouseDriver");
+            
+            // Add jdbcCompliant=false to avoid transaction warnings (ClickHouse doesn't support transactions)
+            String finalUrl = jdbcUrl;
+            if (!jdbcUrl.contains("jdbcCompliant")) {
+                finalUrl = jdbcUrl + (jdbcUrl.contains("?") ? "&" : "?") + "jdbcCompliant=false";
+            }
+            connection = DriverManager.getConnection(finalUrl);
+            // Note: ClickHouse doesn't support transactions, batching is handled automatically
+            
+            // Prepare INSERT statement for benchmark.sensors_local
+            insertStatement = connection.prepareStatement(
+                "INSERT INTO benchmark.sensors_local (" +
+                "device_id, device_type, customer_id, site_id, " +
+                "latitude, longitude, altitude, time, " +
+                "temperature, humidity, pressure, co2_level, noise_level, light_level, motion_detected, " +
+                "battery_level, signal_strength, memory_usage, cpu_usage, " +
+                "status, error_count, " +
+                "packets_sent, packets_received, bytes_sent, bytes_received" +
+                ") VALUES (?, ?, ?, ?, ?, ?, ?, now(), ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
+            );
+            
+            System.out.println("JDBC connection established successfully!");
+        }
+        
+        @Override
+        public void invoke(SensorRecord record, Context context) throws Exception {
+            try {
+                // Insert into benchmark.sensors_local - all 25 fields
+                int idx = 1;
+                insertStatement.setString(idx++, record.device_id);
+                insertStatement.setString(idx++, record.device_type);
+                insertStatement.setString(idx++, record.customer_id);
+                insertStatement.setString(idx++, record.site_id);
+                insertStatement.setDouble(idx++, record.latitude);
+                insertStatement.setDouble(idx++, record.longitude);
+                insertStatement.setDouble(idx++, record.altitude);
+                // time is set by now() in SQL
+                insertStatement.setDouble(idx++, record.temperature);
+                insertStatement.setDouble(idx++, record.humidity);
+                insertStatement.setDouble(idx++, record.pressure);
+                insertStatement.setDouble(idx++, record.co2_level);
+                insertStatement.setDouble(idx++, record.noise_level);
+                insertStatement.setDouble(idx++, record.light_level);
+                insertStatement.setInt(idx++, record.motion_detected);
+                insertStatement.setDouble(idx++, record.battery_level);
+                insertStatement.setDouble(idx++, record.signal_strength);
+                insertStatement.setDouble(idx++, record.memory_usage);
+                insertStatement.setDouble(idx++, record.cpu_usage);
+                insertStatement.setInt(idx++, record.status);
+                insertStatement.setInt(idx++, record.error_count);
+                insertStatement.setLong(idx++, record.packets_sent);
+                insertStatement.setLong(idx++, record.packets_received);
+                insertStatement.setLong(idx++, record.bytes_sent);
+                insertStatement.setLong(idx++, record.bytes_received);
+                
+                // Add to batch instead of executing immediately
+                insertStatement.addBatch();
+                batchCount++;
+                
+                // Execute batch when it reaches BATCH_SIZE OR during checkpoint
+                // Checkpoints will force flush even if batch size not reached
+                if (batchCount >= BATCH_SIZE) {
+                    insertStatement.executeBatch();
+                    // No commit() needed - ClickHouse doesn't support transactions
+                    System.out.println("✅ Batch executed: " + batchCount + " records");
+                    batchCount = 0;
+                }
+                
+                // Log alerts (has_alert is automatically calculated by ClickHouse)
+                if (record.temperature > 35 || record.humidity > 80 || record.battery_level < 20) {
+                    String alertType = record.temperature > 35 ? "HIGH_TEMP" : 
+                                      record.humidity > 80 ? "HIGH_HUMIDITY" : "LOW_BATTERY";
+                    System.out.println("🚨 ALERT: " + record.device_id + " - " + alertType);
+                }
+                
+            } catch (SQLException e) {
+                System.err.println("JDBC Error processing data: " + e.getMessage());
+                e.printStackTrace();
+                // Reset batch on error (no rollback needed - ClickHouse doesn't support transactions)
+                batchCount = 0;
+            }
+        }
+        
+        @Override
+        public void snapshotState(FunctionSnapshotContext context) throws Exception {
+            // Called when Flink takes a checkpoint
+            // Flush any pending batch to ClickHouse before checkpoint completes
+            if (batchCount > 0) {
+                insertStatement.executeBatch();
+                // No commit() needed - ClickHouse doesn't support transactions
+                System.out.println("✅ Checkpoint " + context.getCheckpointId() + 
+                                 ": Flushed " + batchCount + " records to ClickHouse");
+                batchCount = 0;
+            }
+            
+            // Save batch count to checkpoint state (for monitoring)
+            batchCountState.clear();
+            batchCountState.add(batchCount);
+        }
+        
+        @Override
+        public void initializeState(FunctionInitializationContext context) throws Exception {
+            // Initialize checkpoint state
+            ListStateDescriptor<Integer> descriptor = new ListStateDescriptor<>(
+                "batch-count-state",
+                TypeInformation.of(Integer.class)
+            );
+            batchCountState = context.getOperatorStateStore().getListState(descriptor);
+            
+            if (context.isRestored()) {
+                // Restore batch count from checkpoint (if any)
+                for (Integer count : batchCountState.get()) {
+                    batchCount = count;
+                }
+                System.out.println("🔄 Restored from checkpoint - batch count: " + batchCount);
+            }
+        }
+        
+        @Override
+        public void close() throws Exception {
+            // Flush any remaining batch
+            try {
+                if (insertStatement != null && batchCount > 0) {
+                    insertStatement.executeBatch();
+                    // No commit() needed - ClickHouse doesn't support transactions
+                    System.out.println("✅ Final batch executed: " + batchCount + " records");
+                }
+            } catch (SQLException e) {
+                System.err.println("Error flushing final batch: " + e.getMessage());
+            }
+            
+            System.out.println("Closing JDBC connections...");
+            if (insertStatement != null) {
+                insertStatement.close();
+            }
+            if (connection != null) {
+                connection.close();
+            }
+            super.close();
+        }
+    }
+}
\ No newline at end of file
diff --git a/e2e-iot/fluss_flink_realtime/TEST_LOCAL.md b/e2e-iot/fluss_flink_realtime/TEST_LOCAL.md
new file mode 100644
index 0000000..86a41b5
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/TEST_LOCAL.md
@@ -0,0 +1,207 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Local Testing Guide
+
+This guide explains how to test the Fluss producer and Flink job locally with the minimal schema.
+
+## Prerequisites
+
+1. **Maven** - For building the JAR
+2. **Fluss 0.8.0** - Extracted to `demos/demo/deploy_local_kind_fluss/fluss-0.8.0-incubating/`
+3. **Flink 1.20.3** (optional) - For running Flink job locally
+4. **Java 11+** - For running Java applications
+
+## Quick Test (Automated)
+
+Run the automated test script from the repository root (the `cd` path below is the author's checkout; substitute your own):
+
+```bash
+cd /Users/vijayabhaskarv/IOT/FLUSS
+./demos/demo/fluss_flink_realtime_demo/test-local.sh
+```
+
+This script will:
+1. Build the demo JAR
+2. Start Fluss local cluster
+3. Create table with 48 buckets
+4. Start producer (instance 0, 100K devices)
+5. Start Flink aggregation job (if Flink is available)
+
+## Manual Testing (Step-by-Step)
+
+### Step 1: Build the JAR
+
+```bash
+cd /Users/vijayabhaskarv/IOT/FLUSS
+mvn -pl demos/demo/fluss_flink_realtime_demo -am clean package
+```
+
+### Step 2: Start Fluss Local Cluster
+
+```bash
+cd demos/demo/deploy_local_kind_fluss/fluss-0.8.0-incubating
+./bin/local-cluster.sh start
+```
+
+Wait for Fluss to be ready (check coordinator on port 9123):
+```bash
+# Wait until this succeeds
+nc -z localhost 9123
+```
+
+### Step 3: Create Table with 48 Buckets
+
+```bash
+cd /Users/vijayabhaskarv/IOT/FLUSS
+java --add-opens=java.base/java.util=ALL-UNNAMED \
+     --add-opens=java.base/java.lang=ALL-UNNAMED \
+     --add-opens=java.base/java.nio=ALL-UNNAMED \
+     --add-opens=java.base/java.time=ALL-UNNAMED \
+     -cp demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+     org.apache.fluss.benchmark.e2eplatformaws.setup.CreateTableWithBuckets \
+     localhost:9123 iot sensor_readings 48 true
+```
+
+### Step 4: Start Producer
+
+**Single instance (100K devices):**
+```bash
+java --add-opens=java.base/java.util=ALL-UNNAMED \
+     --add-opens=java.base/java.lang=ALL-UNNAMED \
+     --add-opens=java.base/java.nio=ALL-UNNAMED \
+     --add-opens=java.base/java.time=ALL-UNNAMED \
+     -cp demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+     org.apache.fluss.benchmark.e2eplatformaws.producer.FlussSensorProducerAppMultiInstance \
+     --bootstrap localhost:9123 \
+     --database iot \
+     --table sensor_readings \
+     --buckets 48 \
+     --total-producers 1 \
+     --instance-id 0 \
+     --rate 10000 \
+     --writer-threads 4 \
+     --flush 10000 \
+     --stats 50000
+```
+
+**Multiple instances (4 instances, 25K devices each):**
+
+Terminal 1 (Instance 0):
+```bash
+java --add-opens=java.base/java.util=ALL-UNNAMED \
+     --add-opens=java.base/java.lang=ALL-UNNAMED \
+     --add-opens=java.base/java.nio=ALL-UNNAMED \
+     --add-opens=java.base/java.time=ALL-UNNAMED \
+     -cp demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+     org.apache.fluss.benchmark.e2eplatformaws.producer.FlussSensorProducerAppMultiInstance \
+     --bootstrap localhost:9123 \
+     --database iot \
+     --table sensor_readings \
+     --buckets 48 \
+     --total-producers 4 \
+     --instance-id 0 \
+     --rate 50000 \
+     --writer-threads 4
+```
+
+Terminal 2 (Instance 1):
+```bash
+# Re-run the full Terminal 1 command, changing only the instance id to:
+--instance-id 1
+```
+
+Terminal 3 (Instance 2):
+```bash
+# Re-run the full Terminal 1 command, changing only the instance id to:
+--instance-id 2
+```
+
+Terminal 4 (Instance 3):
+```bash
+# Re-run the full Terminal 1 command, changing only the instance id to:
+--instance-id 3
+```
+
+### Step 5: Start Flink Job
+
+**If Flink is installed locally:**
+
+```bash
+# Start Flink cluster (if not running)
+cd /Users/vijayabhaskarv/IOT/FLUSS/flink-1.20.3
+./bin/start-cluster.sh
+
+# Submit Flink job
+./bin/flink run \
+    -c org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob \
+    /Users/vijayabhaskarv/IOT/FLUSS/demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+    --bootstrap localhost:9123 \
+    --database iot \
+    --table sensor_readings \
+    --window-minutes 1
+```
+
+**View Flink UI:**
+- Open http://localhost:8081 in browser
+- Check job status and metrics
+
+### Step 6: Verify Data
+
+**Check Fluss table:**
+```bash
+java --add-opens=java.base/java.util=ALL-UNNAMED \
+     --add-opens=java.base/java.lang=ALL-UNNAMED \
+     --add-opens=java.base/java.nio=ALL-UNNAMED \
+     --add-opens=java.base/java.time=ALL-UNNAMED \
+     -cp demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+     org.apache.fluss.benchmark.e2eplatformaws.inspect.FlussTableLogPeek localhost:9123 iot sensor_readings 10
+```
+
+**Check Flink job output:**
+- The Flink job will print aggregated records every 20,000 records
+- Check Flink UI for job metrics and backpressure
+
+## Schema Verification
+
+The producer writes only these 8 fields to Fluss:
+- `sensor_id` (INT)
+- `sensor_type` (INT)
+- `temperature` (DOUBLE)
+- `humidity` (DOUBLE)
+- `pressure` (DOUBLE)
+- `battery_level` (DOUBLE)
+- `status` (INT)
+- `timestamp` (BIGINT)
+
+The Flink job reads these fields and adds default values for remaining fields at the sink level, matching JDBCFlinkConsumer.java behavior.
+
+## Cleanup
+
+```bash
+# Stop producer (Ctrl+C or kill PID)
+
+# Stop Flink cluster
+cd /Users/vijayabhaskarv/IOT/FLUSS/flink-1.20.3
+./bin/stop-cluster.sh
+
+# Stop Fluss cluster
+cd demos/demo/deploy_local_kind_fluss/fluss-0.8.0-incubating
+./bin/local-cluster.sh stop
+```
+
diff --git a/e2e-iot/fluss_flink_realtime/entrypoint.sh b/e2e-iot/fluss_flink_realtime/entrypoint.sh
new file mode 100644
index 0000000..dee316f
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/entrypoint.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Entrypoint script to ensure IPv4 preference and start Java application.
+# The shebang has to be the first line to take effect, and this script needs
+# bash (arrays, [[ ]]), so it sits above the license header.
+# Don't use set -euo pipefail as it might cause silent failures
+# set -euo pipefail
+
+# Write to both stderr and a log file to ensure we see output
+echo "[entrypoint] Script started" >&2
+echo "[entrypoint] Starting with args: $*" >&2
+echo "[entrypoint] Starting with args: $*" >> /tmp/entrypoint.log 2>&1 || true
+
+# JVM flags: force the IPv4 stack; --add-opens for Apache Arrow compatibility
+# and Flink Kryo serialization (java.nio, java.util, java.time).
+# -Djava.net.preferIPv4Stack=true: disables IPv6 support completely
+# -Djava.net.preferIPv4Addresses=true: prefers IPv4 when both are available
+JAVA_FLAGS=(
+  --add-opens=java.base/java.nio=ALL-UNNAMED
+  --add-opens=java.base/java.util=ALL-UNNAMED
+  --add-opens=java.base/java.time=ALL-UNNAMED
+  -Djava.net.preferIPv4Stack=true
+  -Djava.net.preferIPv4Addresses=true
+)
+# Exported as well so the flags apply to all Java processes
+export JAVA_TOOL_OPTIONS="${JAVA_FLAGS[*]} ${JAVA_TOOL_OPTIONS:-}"
+
+#######################################
+# Prints the first IPv4 address getent reports for a host name.
+# Arguments: $1 - host name
+# Outputs:   the address on stdout, nothing if the lookup fails
+#######################################
+resolve_ipv4_dns() {
+  getent ahostsv4 "$1" 2>/dev/null | awk '{print $1}' | head -1
+}
+
+#######################################
+# Prints the first endpoint IP of a Kubernetes service, queried from the API
+# server with the pod's service account token.
+# Arguments: $1 - service name, $2 - namespace
+# Outputs:   the address on stdout, nothing if it cannot be determined
+#######################################
+resolve_ipv4_k8s() {
+  local sa_dir=/var/run/secrets/kubernetes.io/serviceaccount
+  local token
+  if [ -z "${KUBERNETES_SERVICE_HOST:-}" ] || [ -z "${KUBERNETES_SERVICE_PORT:-}" ]; then
+    return 0
+  fi
+  token=$(cat "$sa_dir/token" 2>/dev/null || echo "")
+  [ -n "$token" ] || return 0
+  # --cacert (not -k) so the bearer token is only sent to the verified API
+  # server; --max-time keeps an unreachable API from stalling startup. The
+  # pattern accepts optional whitespace in case the JSON is pretty-printed.
+  curl -s --max-time 5 --cacert "$sa_dir/ca.crt" -H "Authorization: Bearer $token" \
+    "https://${KUBERNETES_SERVICE_HOST}:${KUBERNETES_SERVICE_PORT}/api/v1/namespaces/$2/endpoints/$1" \
+    2>/dev/null \
+    | grep -oE '"ip"[[:space:]]*:[[:space:]]*"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+"' \
+    | head -1 | cut -d'"' -f4
+}
+
+# Add coordinator hostname to /etc/hosts with IPv4 IP to force IPv4 resolution
+# This prevents Netty from resolving the hostname to IPv6
+# NOTE(review): the IP comes from the service name but is pinned to pod 0's
+# host name; if the service has several endpoints they may differ - confirm.
+COORD_HOST="coordinator-server-0.coordinator-server-hs.default.svc.cluster.local"
+COORD_IP=$(resolve_ipv4_dns coordinator-server-hs.default.svc.cluster.local)
+if [ -n "$COORD_IP" ]; then
+  echo "[entrypoint] Adding $COORD_IP $COORD_HOST to /etc/hosts to force IPv4 resolution" >&2
+  echo "$COORD_IP $COORD_HOST" >> /etc/hosts \
+    || echo "[entrypoint] Warning: could not update /etc/hosts" >&2
+fi
+
+# Resolve the --bootstrap host to an IPv4 address so Netty never does a DNS
+# lookup that might return IPv6. Values that cannot be parsed or resolved are
+# passed through unchanged.
+RESOLVED_ARGS=()
+ORIG_ARGS=("$@")
+i=0
+while [ "$i" -lt "${#ORIG_ARGS[@]}" ]; do
+  arg="${ORIG_ARGS[$i]}"
+  RESOLVED_ARGS+=("$arg")
+  i=$((i + 1))
+  # Only "--bootstrap <value>" is rewritten; everything else is copied as-is
+  if [ "$arg" != "--bootstrap" ] || [ "$i" -ge "${#ORIG_ARGS[@]}" ]; then
+    continue
+  fi
+  HOSTPORT="${ORIG_ARGS[$i]}"
+  i=$((i + 1))
+  HOST="" PORT="" IPV4=""
+  # Accept exactly one host:port. (cut -d: would return the whole string as
+  # the port when there is no colon, and would mangle comma-separated lists.)
+  if [[ "$HOSTPORT" =~ ^([^:,]+):([0-9]+)$ ]]; then
+    HOST="${BASH_REMATCH[1]}"
+    PORT="${BASH_REMATCH[2]}"
+  fi
+  # Method 1: Kubernetes API, for <service>[.<namespace>].svc[.cluster.local].
+  # Pod-qualified names are left to DNS: the API returns service endpoints.
+  if [[ "$HOST" == *.svc.cluster.local || "$HOST" == *.svc ]]; then
+    HOST_PART="${HOST%.cluster.local}"
+    IFS=. read -r -a LABELS <<< "${HOST_PART%.svc}"
+    if [ -n "${LABELS[0]:-}" ] && [ "${#LABELS[@]}" -le 2 ]; then
+      IPV4=$(resolve_ipv4_k8s "${LABELS[0]}" "${LABELS[1]:-default}")
+      [ -z "$IPV4" ] || echo "[entrypoint] Resolved $HOST to IPv4 via Kubernetes API: $IPV4" >&2
+    fi
+  fi
+  # Method 2: Fall back to DNS resolution (IPv4 only)
+  if [ -z "$IPV4" ] && [ -n "$HOST" ]; then
+    IPV4=$(resolve_ipv4_dns "$HOST")
+    [ -z "$IPV4" ] || echo "[entrypoint] Resolved $HOST to IPv4 via DNS: $IPV4" >&2
+  fi
+  # Use resolved IP or fall back to original value
+  if [ -n "$IPV4" ]; then
+    echo "[entrypoint] Using $IPV4:$PORT instead of $HOSTPORT" >&2
+    RESOLVED_ARGS+=("$IPV4:$PORT")
+  else
+    echo "[entrypoint] Warning: Could not resolve ${HOST:-$HOSTPORT} to IPv4, using original: $HOSTPORT" >&2
+    RESOLVED_ARGS+=("$HOSTPORT")
+  fi
+done
+
+# Execute Java with resolved arguments (IP addresses instead of hostnames).
+# The flags are passed on the command line as well as via JAVA_TOOL_OPTIONS.
+echo "[entrypoint] Executing: java ${JAVA_FLAGS[*]} ${RESOLVED_ARGS[*]}" >&2
+exec java "${JAVA_FLAGS[@]}" "${RESOLVED_ARGS[@]}"
+
diff --git a/e2e-iot/fluss_flink_realtime/fluss_writer_table_scan_commands.md b/e2e-iot/fluss_flink_realtime/fluss_writer_table_scan_commands.md
new file mode 100644
index 0000000..c8f959d
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/fluss_writer_table_scan_commands.md
@@ -0,0 +1,120 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Fluss Writer / Table Scan Commands
+
+Working directory for all commands (the workspace root; replace the path below with your own checkout):
+
+```
+cd /Users/vijayabhaskarv/IOT/FLUSS
+```
+
+## 1. Build the demo jar
+
+```
+mvn -pl demos/demo/fluss_flink_realtime_demo -am clean package
+```
+
+Output artifact:
+```
+demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar
+```
+
+## 2. Start / stop Fluss 0.8.0 local cluster
+
+Start:
+```
+fluss-0.8.0-incubating/bin/local-cluster.sh start
+```
+
+Stop:
+```
+fluss-0.8.0-incubating/bin/local-cluster.sh stop
+```
+
+## 3. Producer commands
+
+Continuous stream (Ctrl+C to stop):
+```
+java -jar demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+  --bootstrap localhost:9123 \
+  --database iot \
+  --table sensor_readings \
+  --buckets 12 \
+  --rate 2000 \
+  --flush 5000 \
+  --stats 20000   # log throughput every 20k records (optional)
+```
+
+Limit by count or duration:
+```
+java -jar demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+  --bootstrap localhost:9123 --database iot --table sensor_readings --count 50000
+
+java -jar demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+  --bootstrap localhost:9123 --database iot --table sensor_readings --duration 5M
+```
+Add `--stats <records>` to control how often the producer logs overall/windowed throughput.
+
+## 4. Flink SQL client (metadata check)
+
+```
+flink-1.20.3/bin/sql-client.sh -e "CREATE CATALOG fluss WITH ('type'='fluss','bootstrap.servers'='localhost:9123'); \
+  USE CATALOG fluss; SHOW DATABASES;"
+```
+
+## 5. CLI helpers bundled in the jar
+
+### List databases / tables
+```
+java -cp demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+  org.apache.fluss.benchmark.e2eplatformaws.inspect.FlussMetadataInspector localhost:9123
+```
+Optional single database:
+```
+java -cp demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+  org.apache.fluss.benchmark.e2eplatformaws.inspect.FlussMetadataInspector localhost:9123 iot
+```
+
+### Peek change log records
+
+```
+java --add-opens=java.base/java.nio=ALL-UNNAMED \
+  -cp demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+  org.apache.fluss.benchmark.e2eplatformaws.inspect.FlussTableLogPeek localhost:9123 iot sensor_readings 5
+```
+(Change `5` to print more or fewer records.)
+
+### Peek primary-key snapshot rows
+
+```
+java --add-opens=java.base/java.nio=ALL-UNNAMED \
+  -cp demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+  org.apache.fluss.benchmark.e2eplatformaws.inspect.FlussPrimaryKeySnapshotPeek localhost:9123 iot sensor_readings 5
+```
+(Reads current table snapshot; only supports non-partitioned primary-key tables.)
+
+## 6. Flink aggregation job
+
+(Requires Flink cluster running in `flink-1.20.3`)
+```
+flink-1.20.3/bin/flink run \
+  -c org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob \
+  demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+  --bootstrap localhost:9123 --database iot --table sensor_readings --window-minutes 1
+```
diff --git a/e2e-iot/fluss_flink_realtime/k8s/VERIFICATION_STEPS.md b/e2e-iot/fluss_flink_realtime/k8s/VERIFICATION_STEPS.md
new file mode 100644
index 0000000..25d7f02
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/k8s/VERIFICATION_STEPS.md
@@ -0,0 +1,225 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Fluss + Flink Demo Verification Steps
+
+## Prerequisites
+- `kind` CLI must be installed (the Kind cluster itself is created automatically)
+- Docker must be running
+- kubectl must be installed
+- Maven must be installed (for building JAR)
+
+## Step-by-Step Instructions
+
+**Note:** All commands should be run from the workspace root, shown here as `/Users/vijayabhaskarv/IOT/FLUSS`; replace it with the path to your own checkout.
+
+### Step 1: Deploy Fluss on Kind Cluster
+This script will:
+- Create a new Kind cluster
+- Deploy Fluss coordinator and tablet servers
+- Patch coordinator to use IPv4 addresses
+- Wait for services to be ready
+
+```bash
+cd /Users/vijayabhaskarv/IOT/FLUSS/demos/demo/fluss_flink_realtime_demo
+./run_kind_demo.sh
+```
+
+Or from workspace root:
+```bash
+cd demos/demo/fluss_flink_realtime_demo
+./run_kind_demo.sh
+```
+
+**Expected output:**
+- Kind cluster created
+- Fluss pods running (coordinator-server-0, tablet-server-0, tablet-server-1, tablet-server-2)
+- Coordinator patched to advertise IPv4 IP addresses
+- Fluss accessible on localhost:9123
+
+**Wait for:** All Fluss pods to be in `Running` state:
+```bash
+kubectl get pods
+# Should show coordinator-server-0 and tablet-server-* pods as Running
+```
+
+---
+
+### Step 2: Build and Deploy Producer + Flink Aggregator Jobs
+This script will:
+- Build the demo JAR (if needed)
+- Build Docker image with all fixes
+- Load image into Kind cluster
+- Deploy producer and Flink aggregator as Kubernetes Jobs
+
+```bash
+cd /Users/vijayabhaskarv/IOT/FLUSS/demos/demo/fluss_flink_realtime_demo
+./k8s/deploy_k8s_jobs.sh
+```
+
+Or from workspace root:
+```bash
+cd demos/demo/fluss_flink_realtime_demo
+./k8s/deploy_k8s_jobs.sh
+```
+
+**Expected output:**
+- JAR built successfully
+- Docker image built
+- Image loaded into Kind
+- Producer job created
+- Flink aggregator job created
+
+---
+
+### Step 3: Verify Everything is Running
+
+#### Check Job Status
+```bash
+kubectl get jobs
+# Should show:
+# - fluss-producer: 1/1 completions
+# - flink-aggregator: 1/1 completions (or Running)
+```
+
+#### Check Pod Status
+```bash
+kubectl get pods
+# Should show:
+# - fluss-producer-*: Running or Completed
+# - flink-aggregator-*: Running
+# - coordinator-server-0: Running
+# - tablet-server-*: Running
+```
+
+#### Check Producer Logs
+```bash
+kubectl logs -l app=fluss-producer --tail=50 -f
+```
+
+**Expected output:**
+- "Producer started"
+- "Writing sensor data..."
+- Statistics showing records written
+- No IPv6 errors
+- No connection errors
+
+#### Check Flink Aggregator Logs
+```bash
+kubectl logs -l app=flink-aggregator --tail=50 -f
+```
+
+**Expected output:**
+- "Flink Sensor Aggregation Job started"
+- "Resolved coordinator-server-hs.default.svc.cluster.local to IPv4 via DNS: 10.244.x.x"
+- Windowed aggregation results like:
+  ```
+  SensorAggregate{sensorId=sensor-000001, window=[2025-11-18T...], avgTemp=20.5, ...}
+  ```
+- No `InaccessibleObjectException` errors
+- No `ClassNotFoundException` errors
+- Job running continuously
+
+---
+
+### Step 4: Monitor Aggregation Results
+
+Watch the Flink aggregator output:
+```bash
+kubectl logs -l app=flink-aggregator -f | grep "SensorAggregate"
+```
+
+**Expected:** Continuous stream of aggregated sensor data with:
+- Sensor IDs
+- Time windows (1-minute intervals)
+- Average temperature, humidity, pressure, battery
+- Sensor status (ONLINE, OFFLINE, DEGRADED, MAINTENANCE)
+
+---
+
+### Step 5: Verify Fixes Applied
+
+#### Check IPv4 Configuration
+```bash
+kubectl logs -l app=flink-aggregator | grep -i "ipv4\|resolved"
+# Should show IPv4 resolution messages
+```
+
+#### Check Java Module Opens
+```bash
+kubectl logs -l app=flink-aggregator | grep "entrypoint"
+# Should show: --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.time=ALL-UNNAMED
+```
+
+#### Check Coordinator Advertised Listeners
+```bash
+kubectl logs coordinator-server-0 | grep "advertised.listeners"
+# Should show: advertised.listeners: CLIENT://10.244.x.x:9124 (IPv4 IP, not hostname)
+```
+
+---
+
+## Troubleshooting
+
+### If producer fails:
+1. Check logs: `kubectl logs -l app=fluss-producer`
+2. Check coordinator: `kubectl logs coordinator-server-0`
+3. Verify coordinator is ready: `kubectl get pod coordinator-server-0`
+
+### If Flink aggregator fails:
+1. Check logs: `kubectl logs -l app=flink-aggregator`
+2. Look for specific error messages
+3. Verify all pods are running: `kubectl get pods`
+
+### If you see IPv6 errors:
+1. Verify coordinator patch: `kubectl get statefulset coordinator-server -o yaml | grep advertised`
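+2. Check that `entrypoint.sh` passes the IPv4 system properties (`-Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Addresses=true`) to `java`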
+3. Restart coordinator: `kubectl delete pod coordinator-server-0`
+
+### If you see ClassNotFoundException:
+1. Rebuild JAR: `cd demos/demo/fluss_flink_realtime_demo && mvn -f pom.xml clean package`
+2. Rebuild Docker image: `cd demos/demo/fluss_flink_realtime_demo && docker build -t fluss-demo:latest .`
+3. Reload into Kind: `kind load docker-image fluss-demo:latest --name fluss-kind`
+4. Redeploy jobs: `cd demos/demo/fluss_flink_realtime_demo && kubectl delete job fluss-producer flink-aggregator && ./k8s/deploy_k8s_jobs.sh`
+
+---
+
+## Cleanup
+
+To stop and clean up everything:
+
+```bash
+# Delete Kubernetes jobs
+kubectl delete job fluss-producer flink-aggregator
+
+# Delete Kind cluster (this removes everything)
+kind delete cluster --name fluss-kind
+```
+
+---
+
+## Success Criteria
+
+✅ All pods running without errors
+✅ Producer writing data continuously
+✅ Flink aggregator processing and outputting windowed aggregations
+✅ No IPv6 connection errors
+✅ No Java module access errors
+✅ No missing class errors
+✅ Aggregation results appearing every minute
+
diff --git a/e2e-iot/fluss_flink_realtime/k8s/deploy_fluss_kind.sh b/e2e-iot/fluss_flink_realtime/k8s/deploy_fluss_kind.sh
new file mode 100755
index 0000000..ff9cefc
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/k8s/deploy_fluss_kind.sh
@@ -0,0 +1,161 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail  # fail fast: abort on command errors, unset variables, and failures anywhere in a pipeline
+
+# Automation script for deploying ZooKeeper + Fluss on a local Kind cluster.
+# It mirrors the steps in fluss_kubernetes_local_kind.md.
+
+KIND_NAME=${KIND_NAME:-fluss-kind}
+WORKDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+KIND_CONFIG="${WORKDIR}/kind-cluster-config.yaml"
+ZK_MANIFEST="${WORKDIR}/zookeeper-kind.yaml"
+FLUSS_VALUES="${WORKDIR}/fluss-values-kind.yaml"
+FLUSS_CHART_VERSION=${FLUSS_CHART_VERSION:-0.8.0-incubating}
+FLUSS_IMAGE=${FLUSS_IMAGE:-apache/fluss:0.8.0-incubating}
+
+cat <<'EOF' >"${KIND_CONFIG}"
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+  - role: control-plane
+    kubeadmConfigPatches:
+      - |
+        kind: InitConfiguration
+        nodeRegistration:
+          kubeletExtraArgs:
+            max-pods: "150"
+    extraPortMappings:
+      - containerPort: 30181
+        hostPort: 8081
+        protocol: TCP
+      - containerPort: 30923
+        hostPort: 9123
+        protocol: TCP
+      - containerPort: 30924
+        hostPort: 9124
+        protocol: TCP
+  - role: worker
+  - role: worker
+EOF
+
+printf '%s\n' "[1/7] Creating Kind cluster '${KIND_NAME}'..."
+kind create cluster --config "${KIND_CONFIG}" --name "${KIND_NAME}" >/dev/null
+
+# Block (up to 180s) until every node reports the Ready condition, then list the nodes.
+printf '%s\n' "[2/7] Waiting for Kind nodes to become Ready..."
+kubectl wait --for=condition=Ready node --all --timeout=180s >/dev/null
+kubectl get nodes
+
+# Best-effort CPU/memory caps for the docker containers backing the Kind nodes; a missing container or a failing update is ignored.
+for role in control-plane worker worker2; do
+  node_container="${KIND_NAME}-${role}"
+  docker inspect "${node_container}" >/dev/null 2>&1 || continue
+  docker update --cpus 3 --memory 5g --memory-swap 5g "${node_container}" >/dev/null 2>&1 || true
+done
+
+cat <<'EOF' >"${ZK_MANIFEST}"
+apiVersion: v1
+kind: Service
+metadata:
+  name: zk-svc
+  namespace: default
+  labels:
+    app: zookeeper
+spec:
+  selector:
+    app: zookeeper
+  ports:
+    - name: client
+      port: 2181
+      targetPort: 2181
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: zk
+  namespace: default
+  labels:
+    app: zookeeper
+spec:
+  serviceName: zk-svc
+  replicas: 1
+  selector:
+    matchLabels:
+      app: zookeeper
+  template:
+    metadata:
+      labels:
+        app: zookeeper
+    spec:
+      containers:
+        - name: zookeeper
+          image: zookeeper:3.9.2
+          imagePullPolicy: IfNotPresent
+          ports:
+            - name: client
+              containerPort: 2181
+          resources:
+            requests:
+              cpu: "200m"
+              memory: "256Mi"
+            limits:
+              cpu: "500m"
+              memory: "512Mi"
+EOF
+
+echo "[3/7] Deploying ZooKeeper..."
+kubectl apply -f "${ZK_MANIFEST}" >/dev/null
+kubectl rollout status statefulset/zk --timeout=180s >/dev/null
+
+cat <<EOF >"${FLUSS_VALUES}"
+persistence:
+  enabled: false
+image:
+  registry: docker.io
+  repository: ${FLUSS_IMAGE%:*}
+  tag: ${FLUSS_IMAGE##*:}
+configurationOverrides:
+  "zookeeper.address": zk-svc.default.svc.cluster.local:2181
+EOF
+
+# Pre-load the Fluss image into all Kind nodes.
+echo "[4/7] Pre-loading ${FLUSS_IMAGE} into all Kind nodes..."
+if ! docker image inspect "${FLUSS_IMAGE}" >/dev/null 2>&1; then
+  echo "  Pulling ${FLUSS_IMAGE} from registry..."
+  docker pull "${FLUSS_IMAGE}" >/dev/null
+fi
+echo "  Loading ${FLUSS_IMAGE} into Kind nodes..."
+kind load docker-image "${FLUSS_IMAGE}" --name "${KIND_NAME}" >/dev/null
+echo "  ✓ Image pre-loaded into all Kind nodes"
+
+printf '%s\n' "[5/7] Installing Fluss Helm chart..."
+helm repo add fluss https://downloads.apache.org/incubator/fluss/helm-chart >/dev/null 2>&1 || true  # best-effort: errors from repo add are ignored
+helm repo update >/dev/null
+helm install fluss fluss/fluss --values "${FLUSS_VALUES}" --version "${FLUSS_CHART_VERSION}" >/dev/null
+
+printf '%s\n' "[6/7] Waiting for Fluss pods..."
+kubectl wait pod --selector=app.kubernetes.io/name=fluss --for=condition=Ready --timeout=180s >/dev/null
+
+printf '%s\n' "[7/7] Current pod status:"
+kubectl get pods
+
+printf '\nDeployment complete. Sample next steps:\n'  # printf, not echo: bash's builtin echo would print the "\n" literally
+echo "  kubectl port-forward svc/coordinator-server-hs 9124:9124"
+echo "  helm uninstall fluss && kubectl delete -f ${ZK_MANIFEST}"
+echo "  kind delete cluster --name ${KIND_NAME}"
diff --git a/e2e-iot/fluss_flink_realtime/k8s/deploy_k8s_jobs.sh b/e2e-iot/fluss_flink_realtime/k8s/deploy_k8s_jobs.sh
new file mode 100755
index 0000000..5b5312b
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/k8s/deploy_k8s_jobs.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail  # fail fast: abort on command errors, unset variables, and failures anywhere in a pipeline
+
+# Script to build Docker image and deploy producer + Flink aggregator as Kubernetes Jobs
+# KIND_NAME can be overridden from the environment; the image name and tag are fixed below.
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# DEMO_DIR is the parent directory (where pom.xml, Dockerfile, etc. are located)
+DEMO_DIR=$(cd "${SCRIPT_DIR}/.." && pwd)
+WORKDIR=$(cd "${DEMO_DIR}/../../.." && pwd)  # NOTE(review): three levels above DEMO_DIR; with this script under e2e-iot/fluss_flink_realtime/k8s that is the parent of the repository — confirm intended
+cd "${WORKDIR}"
+
+KIND_NAME=${KIND_NAME:-fluss-kind}
+IMAGE_NAME="fluss-demo"
+IMAGE_TAG="latest"
+FULL_IMAGE="${IMAGE_NAME}:${IMAGE_TAG}"  # image reference used for docker build and kind load
+
+printf '%s\n' "=== Building and Deploying Fluss Demo Jobs to Kind ==="
+
+# Step 1: reuse an already-built demo JAR; Maven runs only when target/ does not contain one.
+if [ -f "${DEMO_DIR}/target/fluss-flink-realtime-demo.jar" ]; then
+    printf '%s\n' "[1/5] Demo JAR exists, skipping build"
+else
+    printf '%s\n' "[1/5] Building demo JAR..."
+    mvn -f "${DEMO_DIR}/pom.xml" clean package
+fi
+
+# Step 2: build the image from DEMO_DIR, which serves as working directory and build context.
+printf '%s\n' "[2/5] Building Docker image ${FULL_IMAGE}..."
+cd "${DEMO_DIR}"
+docker build --tag "${FULL_IMAGE}" .
+
+# Step 3: copy the freshly built image into the Kind cluster's nodes.
+printf '%s\n' "[3/5] Loading ${FULL_IMAGE} into Kind cluster..."
+kind load docker-image --name "${KIND_NAME}" "${FULL_IMAGE}"
+
+# Step 4: create/update the producer Job from the manifest next to this script.
+printf '%s\n' "[4/5] Deploying producer Job..."
+kubectl apply --filename "${SCRIPT_DIR}/k8s-producer-job.yaml"
+
+# Step 5: create/update the Flink aggregator Job from the manifest next to this script.
+printf '%s\n' "[5/5] Deploying Flink aggregator Job..."
+kubectl apply --filename "${SCRIPT_DIR}/k8s-flink-aggregator-job.yaml"
+
+echo ""
+echo "=== Deployment Complete ==="
+echo ""
+echo "Check job status:"
+echo "  kubectl get jobs"
+echo "  kubectl get pods -l app=fluss-producer"
+echo "  kubectl get pods -l app=flink-aggregator"
+echo ""
+echo "View producer logs:"
+echo "  kubectl logs -l app=fluss-producer --tail=50 -f"
+echo ""
+echo "View aggregator logs:"
+echo "  kubectl logs -l app=flink-aggregator --tail=50 -f"
+echo ""
+echo "Delete jobs:"
+echo "  kubectl delete job fluss-producer flink-aggregator"
+
diff --git a/e2e-iot/fluss_flink_realtime/k8s/fluss-values-kind.yaml b/e2e-iot/fluss_flink_realtime/k8s/fluss-values-kind.yaml
new file mode 100644
index 0000000..da80651
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/k8s/fluss-values-kind.yaml
@@ -0,0 +1,25 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+persistence:  # NOTE: k8s/deploy_fluss_kind.sh regenerates this whole file on every run
+  enabled: false
+image:
+  registry: docker.io
+  repository: apache/fluss  # matches the script's default FLUSS_IMAGE (apache/fluss:0.8.0-incubating)
+  tag: 0.8.0-incubating
+configurationOverrides:
+  "zookeeper.address": zk-svc.default.svc.cluster.local:2181  # the zk-svc Service from zookeeper-kind.yaml (namespace default, port 2181)
diff --git a/e2e-iot/fluss_flink_realtime/k8s/k8s-flink-aggregator-job.yaml b/e2e-iot/fluss_flink_realtime/k8s/k8s-flink-aggregator-job.yaml
new file mode 100644
index 0000000..8f4d2ca
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/k8s/k8s-flink-aggregator-job.yaml
@@ -0,0 +1,82 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: flink-aggregator
+  labels:
+    app: flink-aggregator  # label used by the `kubectl ... -l app=flink-aggregator` commands in the docs and deploy script
+spec:
+  backoffLimit: 0  # Don't retry on failure
+  completions: 1    # Only 1 completion needed
+  parallelism: 1   # Only 1 pod at a time
+  template:
+    metadata:
+      labels:
+        app: flink-aggregator
+    spec:
+      restartPolicy: Never
+      initContainers:
+      - name: wait-for-fluss  # holds back the aggregator container until the coordinator host answers ping and port 9124 reports open
+        image: busybox:1.36
+        command:  # NOTE: the script hands the hostname (not a pre-resolved IP) to nc, despite its "with resolved IP" remark
+        - sh
+        - -c
+        - |
+          echo "Waiting for Fluss coordinator to be ready..."
+          COORD_HOST="coordinator-server-hs.default.svc.cluster.local"
+          # Use ping -4 to force IPv4 and test basic connectivity, then use nc for port check
+          # First ensure we can resolve to IPv4
+          until ping -4 -c 1 -W 1 "$COORD_HOST" >/dev/null 2>&1; do
+            echo "Waiting for Fluss DNS resolution..."
+            sleep 2
+          done
+          # Now check if the port is open using nc (with resolved IP, it should use IPv4)
+          until nc -zv "$COORD_HOST" 9124 2>&1 | grep -q "open"; do
+            echo "Waiting for Fluss on port 9124..."
+            sleep 2
+          done
+          echo "Fluss coordinator is ready!"
+      containers:
+      - name: aggregator
+        image: fluss-demo:latest  # the image built and loaded into Kind by k8s/deploy_k8s_jobs.sh
+        imagePullPolicy: Never  # Use local image loaded into Kind
+        securityContext:
+          runAsUser: 0  # Run as root to allow writing to /etc/hosts
+        command:
+        - /app/entrypoint.sh  # NOTE(review): presumably the wrapper that swaps the bootstrap hostname for an IPv4 address before exec'ing java — confirm
+        args:  # passed to the command above as its arguments
+        - -cp
+        - /app/fluss-flink-realtime-demo.jar
+        - org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob
+        - --bootstrap
+        - coordinator-server-hs.default.svc.cluster.local:9124
+        - --database
+        - iot
+        - --table
+        - sensor_readings
+        - --window-minutes
+        - "1"
+        resources:
+          requests:
+            memory: "1Gi"
+            cpu: "500m"
+          limits:
+            memory: "2Gi"
+            cpu: "1000m"
+
diff --git a/e2e-iot/fluss_flink_realtime/k8s/k8s-producer-job.yaml b/e2e-iot/fluss_flink_realtime/k8s/k8s-producer-job.yaml
new file mode 100644
index 0000000..cb56ed8
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/k8s/k8s-producer-job.yaml
@@ -0,0 +1,87 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: fluss-producer
+  labels:
+    app: fluss-producer  # label used by the `kubectl ... -l app=fluss-producer` commands in the docs and deploy script
+spec:
+  backoffLimit: 0  # Don't retry on failure
+  completions: 1    # Only 1 completion needed
+  parallelism: 1   # Only 1 pod at a time
+  template:
+    metadata:
+      labels:
+        app: fluss-producer
+    spec:
+      restartPolicy: Never
+      initContainers:
+      - name: wait-for-fluss  # holds back the producer container until the coordinator host answers ping and port 9124 reports open
+        image: busybox:1.36
+        command:  # NOTE: the script hands the hostname (not a pre-resolved IP) to nc, despite its "with resolved IP" remark
+        - sh
+        - -c
+        - |
+          echo "Waiting for Fluss coordinator to be ready..."
+          COORD_HOST="coordinator-server-hs.default.svc.cluster.local"
+          # Use ping -4 to force IPv4 and test basic connectivity, then use nc for port check
+          # First ensure we can resolve to IPv4
+          until ping -4 -c 1 -W 1 "$COORD_HOST" >/dev/null 2>&1; do
+            echo "Waiting for Fluss DNS resolution..."
+            sleep 2
+          done
+          # Now check if the port is open using nc (with resolved IP, it should use IPv4)
+          until nc -zv "$COORD_HOST" 9124 2>&1 | grep -q "open"; do
+            echo "Waiting for Fluss on port 9124..."
+            sleep 2
+          done
+          echo "Fluss coordinator is ready!"
+      containers:
+      - name: producer
+        image: fluss-demo:latest  # the image built and loaded into Kind by k8s/deploy_k8s_jobs.sh
+        imagePullPolicy: Never  # Use local image loaded into Kind
+        securityContext:
+          runAsUser: 0  # Run as root to allow writing to /etc/hosts
+        command:
+        - /app/entrypoint.sh  # NOTE(review): presumably the wrapper that swaps the bootstrap hostname for an IPv4 address before exec'ing java — confirm
+        args:  # passed to the command above as its arguments
+        - -jar
+        - /app/fluss-flink-realtime-demo.jar
+        - --bootstrap
+        - coordinator-server-hs.default.svc.cluster.local:9124
+        - --database
+        - iot
+        - --table
+        - sensor_readings
+        - --buckets
+        - "12"
+        - --rate
+        - "2000"
+        - --flush
+        - "5000"
+        - --stats
+        - "1000"
+        resources:
+          requests:
+            memory: "512Mi"
+            cpu: "200m"
+          limits:
+            memory: "1Gi"
+            cpu: "500m"
+
diff --git a/e2e-iot/fluss_flink_realtime/k8s/kind-cluster-config.yaml b/e2e-iot/fluss_flink_realtime/k8s/kind-cluster-config.yaml
new file mode 100644
index 0000000..12ef212
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/k8s/kind-cluster-config.yaml
@@ -0,0 +1,39 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+kind: Cluster  # NOTE: k8s/deploy_fluss_kind.sh rewrites this file (same content, no license header) on every run
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+  - role: control-plane
+    kubeadmConfigPatches:
+      - |
+        kind: InitConfiguration
+        nodeRegistration:
+          kubeletExtraArgs:
+            max-pods: "150"
+    extraPortMappings:  # each entry maps a host port to a port on the control-plane node container
+      - containerPort: 30181
+        hostPort: 8081
+        protocol: TCP
+      - containerPort: 30923
+        hostPort: 9123
+        protocol: TCP
+      - containerPort: 30924
+        hostPort: 9124
+        protocol: TCP
+  - role: worker
+  - role: worker
diff --git a/e2e-iot/fluss_flink_realtime/k8s/zookeeper-kind.yaml b/e2e-iot/fluss_flink_realtime/k8s/zookeeper-kind.yaml
new file mode 100644
index 0000000..2b0104c
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/k8s/zookeeper-kind.yaml
@@ -0,0 +1,64 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: v1  # NOTE: k8s/deploy_fluss_kind.sh rewrites this file (same manifests, no license header) on every run
+kind: Service
+metadata:
+  name: zk-svc  # referenced as zk-svc.default.svc.cluster.local:2181 in fluss-values-kind.yaml
+  namespace: default
+  labels:
+    app: zookeeper
+spec:
+  selector:
+    app: zookeeper  # selects the pods created by the zk StatefulSet below
+  ports:
+    - name: client
+      port: 2181
+      targetPort: 2181
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: zk
+  namespace: default
+  labels:
+    app: zookeeper
+spec:
+  serviceName: zk-svc  # NOTE(review): a StatefulSet's serviceName normally names a headless Service; zk-svc above sets no `clusterIP: None` — confirm intended
+  replicas: 1  # single ZooKeeper instance
+  selector:
+    matchLabels:
+      app: zookeeper
+  template:
+    metadata:
+      labels:
+        app: zookeeper
+    spec:
+      containers:
+        - name: zookeeper
+          image: zookeeper:3.9.2
+          imagePullPolicy: IfNotPresent
+          ports:
+            - name: client
+              containerPort: 2181  # target of the zk-svc Service port 2181
+          resources:
+            requests:
+              cpu: "200m"
+              memory: "256Mi"
+            limits:
+              cpu: "500m"
+              memory: "512Mi"
diff --git a/e2e-iot/fluss_flink_realtime/kind_cluster_demo.md b/e2e-iot/fluss_flink_realtime/kind_cluster_demo.md
new file mode 100644
index 0000000..d8b2eaf
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/kind_cluster_demo.md
@@ -0,0 +1,308 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Fluss + Flink Demo on Kind Kubernetes Cluster
+
+This guide walks through deploying Fluss on a local Kind cluster and running the producer + Flink aggregator job against it.
+
+## Quick Start (Automated)
+
+For a fully automated setup, run:
+
+```bash
+cd demos/demo/fluss_flink_realtime_demo  # relative to the project root
+./run_kind_demo.sh
+```
+
+This script will:
+1. Build the demo JAR
+2. Deploy Fluss on Kind
+3. Start local Flink cluster
+4. Start the producer
+5. Submit the Flink aggregation job
+
+See the script output for monitoring commands and cleanup instructions.
+
+## Manual Setup (Step-by-Step)
+
+## Prerequisites
+
+- Docker Desktop (or Docker Engine) running
+- `kind` CLI installed (`brew install kind` or see https://kind.sigs.k8s.io/)
+- `kubectl` CLI installed
+- `helm` CLI installed (`brew install helm`)
+- Maven installed (for building the demo jar)
+- Flink 1.20.3 installed locally (for running Flink jobs)
+
+## Step 1: Build the Demo JAR
+
+From the project root directory:
+
+```bash
+mvn -f demos/demo/fluss_flink_realtime_demo/pom.xml clean package
+```
+
+Output: `demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar`
+
+## Step 2: Deploy Fluss on Kind
+
+From the `demos/demo/fluss_flink_realtime_demo` directory (relative to the project root):
+
+```bash
+# Deploy Fluss on Kind (this script creates the cluster, deploys ZooKeeper, and installs Fluss)
+./k8s/deploy_fluss_kind.sh
+```
+
+This script will:
+1. Create a Kind cluster named `fluss-kind`
+2. Deploy ZooKeeper
+3. Install Fluss via Helm chart
+4. Expose Fluss on `localhost:9123` (via port mapping)
+
+Wait for all pods to be ready:
+
+```bash
+kubectl get pods -n default
+# Wait until all pods show STATUS=Running and READY=1/1
+```
+
+Verify Fluss is accessible:
+
+```bash
+# Check Fluss service
+kubectl get svc -n default | grep fluss
+
+# Test connectivity (should return metadata)
+java -cp demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+  org.apache.fluss.benchmark.e2eplatformaws.inspect.FlussMetadataInspector localhost:9123
+```
+
+## Step 3: Start Local Flink Cluster
+
+From the project root directory (where `flink-1.20.3/` is unpacked):
+
+```bash
+./flink-1.20.3/bin/start-cluster.sh
+```
+
+Verify Flink is running:
+
+```bash
+# Check Flink Web UI (should be accessible at http://localhost:8081)
+curl http://localhost:8081/overview
+```
+
+## Step 4: Run the Producer (Terminal 1)
+
+From the project root directory, start the producer that writes to Fluss on Kind:
+
+```bash
+java -jar demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+  --bootstrap localhost:9123 \
+  --database iot \
+  --table sensor_readings \
+  --buckets 12 \
+  --rate 2000 \
+  --flush 5000 \
+  --stats 1000
+```
+
+The producer will:
+- Create the `iot.sensor_readings` table in Fluss (if it doesn't exist)
+- Continuously generate and upsert sensor data
+- Log throughput statistics every 1000 records
+
+**Note:** Keep this running. Press `Ctrl+C` to stop when done.
+
+## Step 5: Run the Flink Aggregation Job (Terminal 2)
+
+From the project root directory, in a **separate terminal**, submit the Flink job:
+
+```bash
+./flink-1.20.3/bin/flink run \
+  -c org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob \
+  demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+  --bootstrap localhost:9123 \
+  --database iot \
+  --table sensor_readings \
+  --window-minutes 1
+```
+
+The Flink job will:
+- Read changelog events from the Fluss primary-key table
+- Filter for INSERT/UPDATE_AFTER events
+- Aggregate by sensor ID in 1-minute tumbling windows
+- Print aggregated results to TaskManager logs
+
+## Step 6: Monitor the Pipeline
+
+### View Flink Job Status
+
+```bash
+# List running jobs
+./flink-1.20.3/bin/flink list
+
+# View job details in Web UI
+open http://localhost:8081
+```
+
+### View Flink TaskManager Logs
+
+```bash
+# Find the TaskManager log file
+tail -f flink-1.20.3/log/flink-*-taskexecutor-*.log
+
+# Or view aggregated output
+grep "SensorAggregate" flink-1.20.3/log/flink-*-taskexecutor-*.log
+```
+
+### Inspect Fluss Data
+
+```bash
+# List databases
+java -cp demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+  org.apache.fluss.benchmark.e2eplatformaws.inspect.FlussMetadataInspector localhost:9123
+
+# Peek at change log (while producer is running)
+java --add-opens=java.base/java.nio=ALL-UNNAMED \
+  -cp demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+  org.apache.fluss.benchmark.e2eplatformaws.inspect.FlussTableLogPeek localhost:9123 iot sensor_readings 10
+
+# Peek at primary-key snapshot
+java --add-opens=java.base/java.nio=ALL-UNNAMED \
+  -cp demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar \
+  org.apache.fluss.benchmark.e2eplatformaws.inspect.FlussPrimaryKeySnapshotPeek localhost:9123 iot sensor_readings 10
+```
+
+### View Fluss Pod Logs
+
+```bash
+# Coordinator logs
+kubectl logs -n default -l app=fluss-coordinator --tail=50 -f
+
+# Tablet server logs
+kubectl logs -n default -l app=fluss-tablet-server --tail=50 -f
+```
+
+## Step 7: Cleanup
+
+### Stop the Producer and Flink Job
+
+1. Press `Ctrl+C` in the producer terminal (Terminal 1)
+2. Cancel the Flink job:
+   ```bash
+   ./flink-1.20.3/bin/flink cancel <JobID>
+   # Or list and cancel: ./flink-1.20.3/bin/flink list
+   ```
+
+### Stop Local Flink Cluster
+
+```bash
+./flink-1.20.3/bin/stop-cluster.sh
+```
+
+### Delete Kind Cluster
+
+```bash
+# Delete the entire Kind cluster (this removes all Fluss pods and data)
+kind delete cluster --name fluss-kind
+```
+
+Or use the cleanup script if available:
+
+```bash
+# If you have a cleanup script
+./cleanup_fluss_kind.sh
+```
+
+## Troubleshooting
+
+### Fluss Not Accessible on localhost:9123
+
+```bash
+# Check if port mapping is correct
+kubectl get svc -n default | grep fluss
+
+# Check if Fluss pods are running
+kubectl get pods -n default | grep fluss
+
+# Check Fluss coordinator logs
+kubectl logs -n default -l app=fluss-coordinator --tail=100
+```
+
+### Flink Job Fails to Connect
+
+- Verify Fluss is accessible: `java -cp ... FlussMetadataInspector localhost:9123`
+- Check Flink TaskManager logs for connection errors
+- Ensure the bootstrap address is `localhost:9123` (not `localhost:9124`)
+
+### Producer Not Writing Data
+
+- Check producer logs for errors
+- Verify Fluss table exists: `FlussMetadataInspector localhost:9123 iot`
+- Check Fluss tablet server logs: `kubectl logs -l app=fluss-tablet-server --tail=50`
+
+### Flink Job Shows No Output
+
+- Ensure producer is running and generating data
+- Check Flink TaskManager logs for errors
+- Verify the job is reading from the correct table: `--database iot --table sensor_readings`
+- Check if watermarks are advancing (may need to wait for window to close)
+
+## Architecture Overview
+
+```
+┌─────────────────┐
+│  Producer App   │ (Local JVM)
+│  (Terminal 1)   │
+└────────┬────────┘
+         │ writes via Fluss Java SDK
+         ▼
+┌─────────────────────────────────┐
+│  Kind Cluster                   │
+│  ┌───────────────────────────┐ │
+│  │ Fluss Coordinator         │ │
+│  │ (port 9123 → localhost)   │ │
+│  └───────────────────────────┘ │
+│  ┌───────────────────────────┐ │
+│  │ Fluss Tablet Servers (x3) │ │
+│  │ - Stores primary-key data │ │
+│  │ - Maintains change log    │ │
+│  └───────────────────────────┘ │
+│  ┌───────────────────────────┐ │
+│  │ ZooKeeper                 │ │
+│  └───────────────────────────┘ │
+└────────┬─────────────────────────┘
+         │ reads changelog stream
+         ▼
+┌─────────────────┐
+│  Flink Job      │ (Local Flink Cluster)
+│  (Terminal 2)   │
+│  - Aggregates   │
+│  - Prints       │
+└─────────────────┘
+```
+
+## Next Steps
+
+- Increase producer rate: `--rate 5000` or `--rate 10000`
+- Adjust window size: `--window-minutes 5` for 5-minute windows
+- Scale Fluss tablet servers: Edit Helm values and upgrade
+- Add more Flink jobs reading from the same Fluss table
+- Write Flink output to another Fluss table or external sink
+
diff --git a/e2e-iot/fluss_flink_realtime/pom.xml b/e2e-iot/fluss_flink_realtime/pom.xml
new file mode 100644
index 0000000..85276fa
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/pom.xml
@@ -0,0 +1,184 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>org.apache.fluss.benchmark.e2eplatformaws</groupId>
+    <artifactId>fluss-flink-realtime-demo</artifactId>
+    <version>0.1.0-SNAPSHOT</version>
+    <name>Fluss + Flink Realtime Demo</name>
+    <description>Sample Fluss producer and Flink analytics job</description>
+
+    <properties>
+        <!-- Java level plus the Fluss / Flink versions shared by the dependencies below -->
+        <maven.compiler.source>17</maven.compiler.source>
+        <maven.compiler.target>17</maven.compiler.target>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <fluss.version>0.8.0-incubating</fluss.version>
+        <flink.version>1.20.3</flink.version>
+    </properties>
+
+    <dependencies>
+        <!-- Fluss client APIs -->
+        <dependency>
+            <groupId>org.apache.fluss</groupId>
+            <artifactId>fluss-client</artifactId>
+            <version>${fluss.version}</version>
+        </dependency>
+
+        <!-- Flink integration for Fluss (1.20 matching local Flink distribution) -->
+        <dependency>
+            <groupId>org.apache.fluss</groupId>
+            <artifactId>fluss-flink-1.20</artifactId>
+            <version>${fluss.version}</version>
+        </dependency>
+
+        <!-- Flink streaming/runtime APIs, declared with the default (compile) scope so they end up in the shaded jar.
+             Note: When running as standalone (e.g., in Kubernetes jobs), these need to be included.
+             When submitting to Flink cluster, these would typically be provided by the cluster. -->
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-streaming-java</artifactId>
+            <version>${flink.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-clients</artifactId>
+            <version>${flink.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-table-common</artifactId>
+            <version>${flink.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-table-api-java</artifactId>
+            <version>${flink.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-table-api-java-bridge</artifactId>
+            <version>${flink.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-table-planner-loader</artifactId>
+            <version>${flink.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-table-runtime</artifactId>
+            <version>${flink.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-connector-base</artifactId>
+            <version>${flink.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-metrics-prometheus</artifactId>
+            <version>${flink.version}</version>
+        </dependency>
+
+        <!-- Utilities -->
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-lang3</artifactId>
+            <version>3.14.0</version>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+            <version>2.17.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+            <version>2.0.13</version>
+        </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-simple</artifactId>
+            <version>2.0.13</version>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.11.0</version>
+                <configuration>
+                    <release>17</release>
+                </configuration>
+            </plugin>
+
+            <!-- Create an executable fat-jar so both the producer and Flink job can be launched easily -->
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>3.5.1</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <createDependencyReducedPom>false</createDependencyReducedPom>
+                            <!-- The shaded jar is named after the artifactId only (no version suffix) -->
+                            <finalName>${project.artifactId}</finalName>
+                            <transformers>
+                                <!-- Main-Class of the jar manifest: the sensor producer application -->
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                    <mainClass>org.apache.fluss.benchmark.e2eplatformaws.producer.FlussSensorProducerApp</mainClass>
+                                </transformer>
+                                <!-- Merge the META-INF/services files of the bundled dependencies -->
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+                            </transformers>
+                            <!-- Leave out the org/slf4j classes found inside both Fluss artifacts and, for
+                                 fluss-flink-1.20, also its table Factory service file and com/alibaba classes.
+                                 NOTE(review): presumably this avoids clashes with the SLF4J dependency declared
+                                 above and with duplicated classes; confirm the reason before changing it. -->
+                            <filters>
+                            <filter>
+                                <artifact>org.apache.fluss:fluss-client</artifact>
+                                <excludes>
+                                    <exclude>org/slf4j/**</exclude>
+                                </excludes>
+                            </filter>
+                                <filter>
+                                    <artifact>org.apache.fluss:fluss-flink-1.20</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/services/org.apache.flink.table.factories.Factory</exclude>
+                                        <exclude>com/alibaba/**</exclude>
+                                    <exclude>org/slf4j/**</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+</project>
diff --git a/e2e-iot/fluss_flink_realtime/run_kind_demo.sh b/e2e-iot/fluss_flink_realtime/run_kind_demo.sh
new file mode 100755
index 0000000..ec55cb4
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/run_kind_demo.sh
@@ -0,0 +1,233 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Automation script to deploy Fluss on Kind and run the producer + Flink aggregator demo.
+# This script automates the steps in kind_cluster_demo.md
+#
+# Optional environment overrides:
+#   KIND_NAME   Name of the Kind cluster to create (default: fluss-kind)
+#   DEMO_JAR    Path of the shaded demo jar (default: <script dir>/target/fluss-flink-realtime-demo.jar)
+#   FLINK_HOME  Local Flink distribution (default: <WORKDIR>/flink-1.20.3)
+#
+# On success the port-forward, producer and Flink job keep running after this script
+# returns. If the script aborts, the port-forward and producer it started are stopped.
+
+set -euo pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# Working directory (3 levels up from this script); logs are written here and the
+# default Flink distribution is looked up here.
+WORKDIR=$(cd "${SCRIPT_DIR}/../../.." && pwd)
+cd "${WORKDIR}"
+
+KIND_NAME=${KIND_NAME:-fluss-kind}
+# The demo's pom.xml sits next to this script, so Maven writes the shaded jar to
+# ${SCRIPT_DIR}/target no matter where the repository is checked out.
+DEMO_POM="${SCRIPT_DIR}/pom.xml"
+DEMO_JAR=${DEMO_JAR:-${SCRIPT_DIR}/target/fluss-flink-realtime-demo.jar}
+FLINK_HOME=${FLINK_HOME:-${WORKDIR}/flink-1.20.3}
+
+# Colors for output
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+# State filled in as the script progresses; empty until the matching step has run.
+PORT_FORWARD_PID=""
+PRODUCER_PID=""
+JOB_ID=""
+
+# EXIT trap: when the script ends with a non-zero status, stop the background
+# processes started so far so that a failed run does not leak them.
+cleanup_on_error() {
+    local status=$?
+    if [[ ${status} -ne 0 ]]; then
+        if [[ -n "${PRODUCER_PID}" ]]; then
+            kill "${PRODUCER_PID}" 2>/dev/null || true
+        fi
+        if [[ -n "${PORT_FORWARD_PID}" ]]; then
+            kill "${PORT_FORWARD_PID}" 2>/dev/null || true
+        fi
+    fi
+}
+trap cleanup_on_error EXIT
+
+echo -e "${GREEN}=== Fluss + Flink Kind Cluster Demo ===${NC}\n"
+
+# Fail fast, before any cluster is created, if the local Flink distribution is missing.
+if [[ ! -d "${FLINK_HOME}" ]]; then
+    echo -e "${RED}ERROR: Flink not found at ${FLINK_HOME}${NC}"
+    exit 1
+fi
+
+# [1/6] Build demo JAR (skipped when it already exists)
+if [[ ! -f "${DEMO_JAR}" ]]; then
+    echo -e "${YELLOW}[1/6] Building demo JAR...${NC}"
+    mvn -f "${DEMO_POM}" clean package
+    if [[ ! -f "${DEMO_JAR}" ]]; then
+        echo -e "${RED}ERROR: Build finished but ${DEMO_JAR} was not produced${NC}"
+        exit 1
+    fi
+else
+    echo -e "${GREEN}[1/6] Demo JAR already exists, skipping build${NC}"
+fi
+
+# [2/6] Deploy Fluss on Kind
+echo -e "\n${YELLOW}[2/6] Deploying Fluss on Kind cluster...${NC}"
+# -x/-F: compare whole lines literally so the cluster name is never treated as a regex.
+if kind get clusters | grep -qxF -- "${KIND_NAME}"; then
+    echo -e "${YELLOW}Kind cluster '${KIND_NAME}' already exists. Delete it first with:${NC}"
+    echo -e "  ${RED}kind delete cluster --name ${KIND_NAME}${NC}"
+    exit 1
+fi
+
+"${SCRIPT_DIR}/k8s/deploy_fluss_kind.sh"
+
+# Wait a bit for services to stabilize
+echo -e "\n${YELLOW}Waiting for Fluss to be ready...${NC}"
+sleep 10
+
+# Fix: Patch coordinator to advertise IPv4 IP instead of hostname to prevent IPv6 resolution issues
+echo -e "\n${YELLOW}Patching coordinator to use IPv4 addresses...${NC}"
+# The payload is single-quoted on purpose: ${POD_NAME}, ${POD_IP} and $FLUSS_HOME must reach
+# the container unexpanded. stderr stays visible so a failing patch explains itself.
+kubectl patch statefulset coordinator-server --type='json' -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/command/2", "value": "export FLUSS_SERVER_ID=${POD_NAME##*-} && cp /opt/conf/server.yaml $FLUSS_HOME/conf && echo \"\" >> $FLUSS_HOME/conf/server.yaml && echo \"tablet-server.id: ${FLUSS_SERVER_ID}\" >> $FLUSS_HOME/conf/server.yaml && echo \"bind.listeners: INTERNAL://0.0.0.0:9122, CLIENT://0.0.0.0:9124\" >> $FLUSS_HOME/conf/server.yaml && echo \"advertised.listeners: CLIENT://${POD_IP}:9124\" >> $FLUSS_HOME/conf/server.yaml && bin/coordinator-server.sh start-foreground"}]' >/dev/null
+# Restart coordinator pod to apply the patch
+kubectl delete pod coordinator-server-0 --wait=false --ignore-not-found >/dev/null
+# Wait on the StatefulSet rollout rather than on the pod: right after the delete the pod
+# is either still the old (Ready) one or not recreated yet, so `kubectl wait pod/...`
+# can succeed too early or fail with NotFound.
+kubectl rollout status statefulset/coordinator-server --timeout=120s >/dev/null
+echo -e "${GREEN}✓ Coordinator patched to advertise IPv4 IP addresses${NC}"
+
+# Set up port forwarding to coordinator service on port 9124
+echo -e "\n${YELLOW}Setting up port forwarding to coordinator (port 9124)...${NC}"
+kubectl port-forward svc/coordinator-server-hs 9124:9124 >/dev/null 2>&1 &
+PORT_FORWARD_PID=$!
+sleep 3
+
+# [3/6] Verify Fluss is accessible
+echo -e "\n${YELLOW}[3/6] Verifying Fluss connectivity...${NC}"
+if ! java -cp "${DEMO_JAR}" \
+    org.apache.fluss.benchmark.e2eplatformaws.inspect.FlussMetadataInspector localhost:9124 >/dev/null 2>&1; then
+    echo -e "${RED}ERROR: Cannot connect to Fluss on localhost:9124${NC}"
+    echo -e "${YELLOW}Check Fluss pods: kubectl get pods${NC}"
+    # The EXIT trap stops the port-forward started above.
+    exit 1
+fi
+echo -e "${GREEN}✓ Fluss is accessible on localhost:9124${NC}"
+
+# [4/6] Start Flink cluster (unless a standalone session is already running)
+echo -e "\n${YELLOW}[4/6] Starting local Flink cluster...${NC}"
+if pgrep -f "flink.*standalonesession" >/dev/null; then
+    echo -e "${GREEN}Flink cluster already running${NC}"
+else
+    "${FLINK_HOME}/bin/start-cluster.sh" >/dev/null 2>&1
+    sleep 5
+    echo -e "${GREEN}✓ Flink cluster started${NC}"
+fi
+
+# [5/6] Start producer in background
+echo -e "\n${YELLOW}[5/6] Starting producer (background)...${NC}"
+PRODUCER_LOG="${WORKDIR}/producer.log"
+java -jar "${DEMO_JAR}" \
+    --bootstrap localhost:9124 \
+    --database iot \
+    --table sensor_readings \
+    --buckets 12 \
+    --rate 2000 \
+    --flush 5000 \
+    --stats 1000 \
+    > "${PRODUCER_LOG}" 2>&1 &
+PRODUCER_PID=$!
+echo -e "${GREEN}✓ Producer started (PID: ${PRODUCER_PID}, log: ${PRODUCER_LOG})${NC}"
+
+# Wait a bit for producer to create table and start writing
+sleep 5
+
+# [6/6] Submit Flink job
+echo -e "\n${YELLOW}[6/6] Submitting Flink aggregation job...${NC}"
+FLINK_LOG="${WORKDIR}/flink-job.log"
+# `run` is an argument of the flink CLI and must stay outside the quoted executable path.
+"${FLINK_HOME}/bin/flink" run \
+    -c org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob \
+    "${DEMO_JAR}" \
+    --bootstrap localhost:9124 \
+    --database iot \
+    --table sensor_readings \
+    --window-minutes 1 \
+    > "${FLINK_LOG}" 2>&1 &
+FLINK_SUBMIT_PID=$!
+
+# Wait up to 30 seconds for the client to report the submission; stop early if it exits.
+JOB_SUBMITTED=false
+for _ in {1..30}; do
+    if grep -q "Job has been submitted" "${FLINK_LOG}" 2>/dev/null; then
+        JOB_SUBMITTED=true
+        break
+    fi
+    if ! kill -0 "${FLINK_SUBMIT_PID}" 2>/dev/null; then
+        # Client already exited: look once more in case the message was written just before.
+        if grep -q "Job has been submitted" "${FLINK_LOG}" 2>/dev/null; then
+            JOB_SUBMITTED=true
+        fi
+        break
+    fi
+    sleep 1
+done
+
+# Check if job was submitted successfully
+if [[ "${JOB_SUBMITTED}" == true ]]; then
+    # `|| true`: an unparsable JobID must not abort the script under `set -o pipefail`.
+    JOB_ID=$(grep "Job has been submitted" "${FLINK_LOG}" | grep -oE "JobID [a-f0-9]+" | awk '{print $2}' || true)
+    echo -e "${GREEN}✓ Flink job submitted (JobID: ${JOB_ID})${NC}"
+    echo -e "${GREEN}  View job status: ${FLINK_HOME}/bin/flink list${NC}"
+    echo -e "${GREEN}  View Flink UI: http://localhost:8081${NC}"
+else
+    echo -e "${RED}ERROR: Flink job submission may have failed. Check ${FLINK_LOG}${NC}"
+fi
+
+# Summary
+echo -e "\n${GREEN}=== Demo is running! ===${NC}\n"
+echo -e "Producer:"
+echo -e "  PID: ${PRODUCER_PID}"
+echo -e "  Log: tail -f ${PRODUCER_LOG}"
+echo -e "  Stop: kill ${PRODUCER_PID}"
+echo -e ""
+echo -e "Flink Job:"
+echo -e "  Log: tail -f ${FLINK_LOG}"
+echo -e "  Status: ${FLINK_HOME}/bin/flink list"
+echo -e "  UI: http://localhost:8081"
+echo -e "  TaskManager logs: tail -f ${FLINK_HOME}/log/flink-*-taskexecutor-*.log"
+echo -e ""
+echo -e "Fluss (Kind):"
+echo -e "  Check pods: kubectl get pods"
+echo -e "  Coordinator logs: kubectl logs -l app=fluss-coordinator --tail=50 -f"
+echo -e "  Tablet server logs: kubectl logs -l app=fluss-tablet-server --tail=50 -f"
+echo -e ""
+echo -e "Inspect Fluss data:"
+echo -e "  java -cp ${DEMO_JAR} org.apache.fluss.benchmark.e2eplatformaws.inspect.FlussMetadataInspector localhost:9124"
+echo -e ""
+echo -e "${YELLOW}To stop everything:${NC}"
+echo -e "  1. kill ${PRODUCER_PID}"
+echo -e "  2. kill ${PORT_FORWARD_PID}  # Stop port forwarding"
+echo -e "  3. ${FLINK_HOME}/bin/flink cancel ${JOB_ID:-<JobID>}"
+echo -e "  4. ${FLINK_HOME}/bin/stop-cluster.sh"
+echo -e "  5. kind delete cluster --name ${KIND_NAME}"
+
diff --git a/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/flink/FlinkMetrics.java b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/flink/FlinkMetrics.java
new file mode 100644
index 0000000..080170d
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/flink/FlinkMetrics.java
@@ -0,0 +1,227 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.benchmark.e2eplatformaws.flink;
+
+import org.apache.flink.metrics.Metric;
+import org.apache.flink.metrics.MetricGroup;
+import org.apache.flink.metrics.Gauge;
+import org.apache.flink.metrics.Counter;
+import org.apache.flink.metrics.Meter;
+import org.apache.flink.runtime.metrics.MetricRegistry;
+import org.apache.flink.runtime.metrics.groups.AbstractMetricGroup;
+import org.apache.flink.runtime.metrics.scope.ScopeFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.Serializable;
+import java.net.InetSocketAddress;
+import java.nio.charset.StandardCharsets;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.LongAdder;
+
+import com.sun.net.httpserver.HttpServer;
+
+/**
+ * Simple Prometheus metrics server for the Flink aggregator.
+ *
+ * <p>Starts a JDK {@link HttpServer} on the port passed to the constructor and serves the
+ * current values in the Prometheus text exposition format at the {@code /metrics} endpoint.
+ *
+ * <p>The class is {@link Serializable}; the HTTP server is {@code transient}, so a
+ * deserialized instance has no running server until {@link #start()} is called on it.
+ */
+public class FlinkMetrics implements Serializable {
+    private static final long serialVersionUID = 1L;
+    private static final Logger LOG = LoggerFactory.getLogger(FlinkMetrics.class);
+    /** Content type of the Prometheus text exposition format. */
+    private static final String CONTENT_TYPE = "text/plain; version=0.0.4; charset=utf-8";
+
+    final LongAdder recordsIn = new LongAdder();
+    final LongAdder recordsOut = new LongAdder();
+    final AtomicLong startTime = new AtomicLong(System.currentTimeMillis());
+    private final AtomicLong lastUpdateTime = new AtomicLong(System.currentTimeMillis());
+
+    // Track additional metrics
+    private final AtomicLong eventTimeLag = new AtomicLong(0);
+    private final Map<String, Long> bucketOffsets = new ConcurrentHashMap<>();
+    // Written by updateBackpressure(); the /metrics output reports an estimate instead.
+    private final AtomicLong backpressureTime = new AtomicLong(0);
+
+    // transient: HttpServer is not Serializable and must not be part of the serialized form.
+    private transient HttpServer server;
+    private final int port;
+
+    /**
+     * @param port TCP port the metrics HTTP server listens on once {@link #start()} is called
+     */
+    public FlinkMetrics(int port) {
+        this.port = port;
+    }
+
+    /**
+     * Creates the HTTP server on the configured port and starts serving {@code /metrics}.
+     *
+     * @throws IOException if the server cannot be created
+     */
+    public void start() throws IOException {
+        server = HttpServer.create(new InetSocketAddress(port), 0);
+        server.createContext("/metrics", this::handleMetrics);
+        server.setExecutor(null); // Use default executor
+        server.start();
+        LOG.info("Flink metrics server started on port {}", port);
+    }
+
+    /** Stops the HTTP server immediately if it was started; otherwise does nothing. */
+    public void stop() {
+        if (server != null) {
+            server.stop(0);
+            LOG.info("Flink metrics server stopped");
+        }
+    }
+
+    /** Counts one input record and refreshes the last-update timestamp. */
+    public void recordInput() {
+        recordsIn.increment();
+        lastUpdateTime.set(System.currentTimeMillis());
+    }
+
+    /** Counts one output record and refreshes the last-update timestamp. */
+    public void recordOutput() {
+        recordsOut.increment();
+        lastUpdateTime.set(System.currentTimeMillis());
+    }
+
+    /** Sets the event-time lag (in milliseconds) reported by the endpoint. */
+    public void updateEventTimeLag(long lagMs) {
+        eventTimeLag.set(lagMs);
+    }
+
+    /** Records the latest offset for {@code bucket}, replacing any previous value. */
+    public void updateBucketOffset(String bucket, long offset) {
+        bucketOffsets.put(bucket, offset);
+    }
+
+    /** Stores the given backpressure value; it is not part of the exported metrics yet. */
+    public void updateBackpressure(long backpressureMs) {
+        backpressureTime.set(backpressureMs);
+    }
+
+    /** @return total number of input records counted so far */
+    public long getRecordsIn() {
+        return recordsIn.sum();
+    }
+
+    /** @return creation time of this instance in epoch milliseconds */
+    public long getStartTime() {
+        return startTime.get();
+    }
+
+    /**
+     * Answers a request to {@code /metrics} with the rendered metrics as UTF-8 text.
+     *
+     * @throws IOException if writing the response fails
+     */
+    private void handleMetrics(com.sun.net.httpserver.HttpExchange exchange) throws IOException {
+        try {
+            // Content-Length must be the encoded byte count, not the number of chars.
+            byte[] body = renderMetrics().getBytes(StandardCharsets.UTF_8);
+            exchange.getResponseHeaders().set("Content-Type", CONTENT_TYPE);
+            exchange.sendResponseHeaders(200, body.length);
+            try (OutputStream os = exchange.getResponseBody()) {
+                os.write(body);
+            }
+        } finally {
+            // Release the exchange even when rendering or writing throws.
+            exchange.close();
+        }
+    }
+
+    /** Builds the metrics document in the Prometheus text exposition format. */
+    private String renderMetrics() {
+        long currentTime = System.currentTimeMillis();
+        long in = recordsIn.sum();
+        long out = recordsOut.sum();
+        long elapsedSeconds = (currentTime - startTime.get()) / 1000;
+
+        double inRate = elapsedSeconds > 0 ? (double) in / elapsedSeconds : 0.0;
+        double outRate = elapsedSeconds > 0 ? (double) out / elapsedSeconds : 0.0;
+
+        StringBuilder response = new StringBuilder();
+        response.append("# HELP flink_taskmanager_job_task_operator_numRecordsIn Total number of input records\n");
+        response.append("# TYPE flink_taskmanager_job_task_operator_numRecordsIn counter\n");
+        response.append("flink_taskmanager_job_task_operator_numRecordsIn ").append(in).append("\n");
+
+        response.append("# HELP flink_taskmanager_job_task_operator_numRecordsOut Total number of output records\n");
+        response.append("# TYPE flink_taskmanager_job_task_operator_numRecordsOut counter\n");
+        response.append("flink_taskmanager_job_task_operator_numRecordsOut ").append(out).append("\n");
+
+        // Locale.ROOT keeps '.' as the decimal separator regardless of the JVM default locale.
+        response.append("# HELP flink_taskmanager_job_task_operator_numRecordsInPerSecond Input records per second\n");
+        response.append("# TYPE flink_taskmanager_job_task_operator_numRecordsInPerSecond gauge\n");
+        response.append("flink_taskmanager_job_task_operator_numRecordsInPerSecond ").append(String.format(Locale.ROOT, "%.2f", inRate)).append("\n");
+
+        response.append("# HELP flink_taskmanager_job_task_operator_numRecordsOutPerSecond Output records per second\n");
+        response.append("# TYPE flink_taskmanager_job_task_operator_numRecordsOutPerSecond gauge\n");
+        response.append("flink_taskmanager_job_task_operator_numRecordsOutPerSecond ").append(String.format(Locale.ROOT, "%.2f", outRate)).append("\n");
+
+        response.append("# HELP flink_taskmanager_job_task_operator_uptime_seconds Flink job uptime in seconds\n");
+        response.append("# TYPE flink_taskmanager_job_task_operator_uptime_seconds gauge\n");
+        response.append("flink_taskmanager_job_task_operator_uptime_seconds ").append(elapsedSeconds).append("\n");
+
+        // Event Time Lag
+        response.append("# HELP flink_taskmanager_job_task_operator_currentFetchEventTimeLag Event time lag in milliseconds\n");
+        response.append("# TYPE flink_taskmanager_job_task_operator_currentFetchEventTimeLag gauge\n");
+        response.append("flink_taskmanager_job_task_operator_currentFetchEventTimeLag ").append(eventTimeLag.get()).append("\n");
+
+        // Current Offset per Bucket
+        response.append("# HELP flink_taskmanager_job_task_operator_fluss_reader_bucket_currentOffset Current offset per bucket\n");
+        response.append("# TYPE flink_taskmanager_job_task_operator_fluss_reader_bucket_currentOffset gauge\n");
+        for (Map.Entry<String, Long> entry : bucketOffsets.entrySet()) {
+            response.append("flink_taskmanager_job_task_operator_fluss_reader_bucket_currentOffset{bucket=\"")
+                    .append(escapeLabelValue(entry.getKey()))
+                    .append("\"} ")
+                    .append(entry.getValue())
+                    .append("\n");
+        }
+
+        // Backpressure (estimated based on input/output rate difference)
+        long backpressureMs = 0;
+        if (inRate > 0 && outRate > 0 && inRate > outRate) {
+            // Estimate backpressure: if input rate > output rate, there's backpressure
+            // Calculate as milliseconds of backpressure per second
+            double rateDiff = inRate - outRate;
+            backpressureMs = (long) ((rateDiff / inRate) * 1000); // Convert to ms per second
+        }
+        response.append("# HELP flink_taskmanager_job_task_backPressuredTimeMsPerSecond Backpressure time in milliseconds per second\n");
+        response.append("# TYPE flink_taskmanager_job_task_backPressuredTimeMsPerSecond gauge\n");
+        response.append("flink_taskmanager_job_task_backPressuredTimeMsPerSecond ").append(backpressureMs).append("\n");
+
+        return response.toString();
+    }
+
+    /** Escapes backslash, double quote and line feed as required inside a label value. */
+    private static String escapeLabelValue(String value) {
+        return value.replace("\\", "\\\\").replace("\"", "\\\"").replace("\n", "\\n");
+    }
+}
+
diff --git a/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/flink/FlinkSensorAggregatorJob.java b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/flink/FlinkSensorAggregatorJob.java
new file mode 100644
index 0000000..01f9333
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/flink/FlinkSensorAggregatorJob.java
@@ -0,0 +1,761 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.benchmark.e2eplatformaws.flink;
+
+import org.apache.fluss.benchmark.e2eplatformaws.model.SensorData;
+import org.apache.flink.api.common.functions.AggregateFunction;
+import org.apache.flink.api.common.functions.RichMapFunction;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.metrics.Counter;
+import org.apache.flink.metrics.Gauge;
+import org.apache.flink.metrics.MetricGroup;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
+import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
+import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
+import org.apache.flink.streaming.api.windowing.time.Time;
+import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
+import org.apache.flink.table.api.EnvironmentSettings;
+import org.apache.flink.table.api.Table;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+import org.apache.flink.table.data.StringData;
+import org.apache.flink.table.data.TimestampData;
+import org.apache.flink.types.Row;
+import org.apache.flink.types.RowKind;
+import org.apache.flink.util.Collector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.time.Duration;
+import java.time.Instant;
+import java.util.Locale;
+import java.util.Objects;
+
+/**
+ * Flink streaming job that reads the primary-key table written by {@link
+ * org.apache.fluss.benchmark.e2eplatformaws.producer.FlussSensorProducerApp}, performs tumbling-window aggregations, and
+ * prints the results. The logic mirrors the Pulsar → Flink → ClickHouse path from the original
+ * RealtimeDataPlatform example but uses Fluss as both the source and storage.
+ */
+public final class FlinkSensorAggregatorJob {
+    private static final Logger LOG = LoggerFactory.getLogger(FlinkSensorAggregatorJob.class);
+
+    // Set IPv4-only properties in static initializer to ensure they're set before any class loading
+    static {
+        System.setProperty("java.net.preferIPv4Stack", "true");
+        System.setProperty("java.net.preferIPv4Addresses", "true");
+    }
+
+    /**
+     * Builds and runs the pipeline: Fluss changelog source, row-to-reading mapper, per-sensor
+     * tumbling processing-time window aggregation, and a logging sink.
+     *
+     * @param args command-line flags, parsed by {@code JobOptions.parse}
+     * @throws Exception if the pipeline cannot be built or the job execution fails
+     */
+    public static void main(String[] args) throws Exception {
+        // Ensure IPv4 properties are set
+        System.setProperty("java.net.preferIPv4Stack", "true");
+        System.setProperty("java.net.preferIPv4Addresses", "true");
+        
+        JobOptions options = JobOptions.parse(args);
+        LOG.info("Starting Flink aggregation job with options: {}", options);
+
+        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+        // Checkpointing is not enabled in code here; per the original note it is configured in
+        // flink-conf.yaml. The source query below uses scan.startup.mode='latest' so the job
+        // starts from the latest position.
+        EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build();
+        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env, settings);
+
+        String catalogDdl = String.format(
+                Locale.ROOT,
+                "CREATE CATALOG %s WITH (\n"
+                        + "  'type' = 'fluss',\n"
+                        + "  'bootstrap.servers' = '%s',\n"
+                        + "  'default-database' = '%s'\n)",
+                options.catalog,
+                options.bootstrap,
+                options.database);
+        tEnv.executeSql(catalogDdl);
+        tEnv.executeSql("USE CATALOG " + options.catalog);
+        tEnv.executeSql("USE " + options.database);
+
+        // Read from table with scan.startup.mode='latest' to start from latest position
+        // instead of reading from the beginning
+        String tableQuery = String.format(
+                Locale.ROOT,
+                "SELECT * FROM %s /*+ OPTIONS('scan.startup.mode' = 'latest') */",
+                options.table);
+        Table sourceTable = tEnv.sqlQuery(tableQuery);
+        DataStream<Row> changelogStream = tEnv.toChangelogStream(sourceTable);
+
+        // Filter for only INSERT and UPDATE_AFTER events (ignore UPDATE_BEFORE and DELETE)
+        DataStream<Row> rowStream = changelogStream
+                .filter(row -> row.getKind() == RowKind.INSERT || row.getKind() == RowKind.UPDATE_AFTER)
+                .name("FlussChangelogFilter");
+
+        // Use RichMapFunction to access Flink's built-in metrics API
+        DataStream<SensorReading> sensorStream = rowStream
+                .map(new RichMapFunction<Row, SensorReading>() {
+                    private transient Counter recordsInCounter;
+                    private transient MetricGroup customMetricsGroup;
+                    
+                    // Gauge implementation for event time lag
+                    final class EventTimeLagGauge implements Gauge<Long> {
+                        private volatile long lagMs = 0L;
+                        
+                        public void update(long lagMs) {
+                            this.lagMs = lagMs;
+                        }
+                        
+                        @Override
+                        public Long getValue() {
+                            return lagMs;
+                        }
+                    }
+                    
+                    private transient EventTimeLagGauge eventTimeLagGauge;
+                    
+                    @Override
+                    public void open(Configuration parameters) throws Exception {
+                        super.open(parameters);
+                        // Get metric group for this operator
+                        MetricGroup metricGroup = getRuntimeContext().getMetricGroup();
+                        // Create custom metrics group
+                        customMetricsGroup = metricGroup.addGroup("fluss_aggregator");
+                        // Register counter for input records
+                        recordsInCounter = customMetricsGroup.counter("records_in");
+                        // Register gauge for event time lag
+                        eventTimeLagGauge = new EventTimeLagGauge();
+                        customMetricsGroup.gauge("event_time_lag_ms", eventTimeLagGauge);
+                    }
+                    
+                    @Override
+                    public SensorReading map(Row reading) throws Exception {
+                        // Increment input counter
+                        recordsInCounter.inc();
+                        
+                        SensorReading sensorReading = FlinkSensorAggregatorJob.toSensorReading(reading);
+                        
+                        // Calculate event time lag (difference between event time and current time)
+                        long eventTimeMs = sensorReading.eventTime.toEpochMilli();
+                        long currentTimeMs = System.currentTimeMillis();
+                        long lagMs = currentTimeMs - eventTimeMs;
+                        eventTimeLagGauge.update(lagMs);
+                        
+                        return sensorReading;
+                    }
+                })
+                .name("FlussSensorReadingMapper");
+
+        // Windowing is based on PROCESSING TIME, so no watermarks are assigned.
+        // SensorAggregateFunction aggregates incrementally; WindowEnricher adds window metadata.
+        // Chaining is disabled on the window operator so it runs as its own task.
+        SingleOutputStreamOperator<SensorAggregate> aggregates = sensorStream
+                .keyBy(reading -> reading.sensorId)
+                .window(TumblingProcessingTimeWindows.of(Time.minutes(options.windowMinutes)))
+                .aggregate(new SensorAggregateFunction(), new WindowEnricher())
+                .name("TumblingWindowAggregation")
+                .disableChaining();  // Disable chaining to allow better resource utilization
+
+        aggregates
+                // name() must follow addSink(): calling it on 'aggregates' would rename the window operator
+                .addSink(new RichSinkFunction<SensorAggregate>() {
+            private transient Counter recordsOutCounter;
+            private transient MetricGroup customMetricsGroup;
+            private transient long recordCount = 0L;
+            private static final long PRINT_INTERVAL = 20000L; // Print every 20000 aggregates
+            
+            @Override
+            public void open(Configuration parameters) throws Exception {
+                super.open(parameters);
+                // Get metric group for this sink
+                MetricGroup metricGroup = getRuntimeContext().getMetricGroup();
+                // Create custom metrics group
+                customMetricsGroup = metricGroup.addGroup("fluss_aggregator");
+                // Register counter for output records
+                recordsOutCounter = customMetricsGroup.counter("records_out");
+            }
+            
+            @Override
+            public void invoke(SensorAggregate value, Context context) throws Exception {
+                // Increment record count and counter for every record
+                recordCount++;
+                recordsOutCounter.inc();
+                
+                // Only log when actually needed (lazy evaluation)
+                // This reduces overhead for 99.995% of records
+                if (recordCount % PRINT_INTERVAL == 0) {
+                    // Convert only when printing to avoid unnecessary work
+                    SensorRecord fullRecord = toSensorRecord(value);
+                    // Use LOG instead of System.out.println for async, non-blocking logging
+                    LOG.info("Aggregate #{}: {} | Full Record: {}", recordCount, value, fullRecord);
+                }
+            }
+        })
+        .name("FlussAggregatorSink")  // Name the sink operator for metrics
+        .disableChaining();  // Keep the sink in its own task, separate from the window operator
+
+        env.execute("Fluss Sensor Aggregation Job");
+    }
+
+    /**
+     * Convert a Fluss Row to a SensorReading, deriving the string id, type name, site and status.
+     * Fluss table schema: sensor_id (INT), sensor_type (INT), temperature, humidity, pressure, 
+     *                     battery_level, status (INT), timestamp (BIGINT, read as epoch millis).
+     */
+    private static SensorReading toSensorReading(Row row) {
+        // Read fields from Fluss table (minimal schema); the as* helpers map null fields to 0
+        int sensorIdInt = asInt(row.getField(0));
+        int sensorTypeInt = asInt(row.getField(1));
+        double temperature = asDouble(row.getField(2));
+        double humidity = asDouble(row.getField(3));
+        double pressure = asDouble(row.getField(4));
+        double battery = asDouble(row.getField(5));
+        int statusInt = asInt(row.getField(6));
+        long timestamp = asLong(row.getField(7));
+        // Convert to SensorReading format
+        String sensorId = "sensor_" + sensorIdInt;
+        String sensorType = getSensorTypeString(sensorTypeInt);
+        // Locale.ROOT keeps the zero-padded site number in ASCII digits whatever the JVM default locale is
+        String location = "site_" + String.format(Locale.ROOT, "%03d", (sensorIdInt % 100) + 1);
+        String status = statusInt == 1 ? "ONLINE" : statusInt == 2 ? "OFFLINE" : 
+                       statusInt == 3 ? "MAINTENANCE" : "ERROR";
+        Instant eventTime = Instant.ofEpochMilli(timestamp);
+        
+        // Create metadata with default values (matching JDBCFlinkConsumer.java)
+        SensorData.MetaData meta = new SensorData.MetaData(
+            "AcmeSensors",  // manufacturer - default
+            "X100",         // model - default
+            "1.0.0",        // firmware - default
+            0.0,            // latitude - default
+            0.0             // longitude - default
+        );
+        
+        return new SensorReading(sensorId, sensorType, location, temperature, humidity, pressure, battery, status, eventTime, meta);
+    }
+    
+    /**
+     * Convert SensorAggregate to SensorRecord with all fields (adding defaults matching JDBCFlinkConsumer.java).
+     * Averages become the readings; a null sensor id or status is tolerated (id 0 / status 4).
+     */
+    private static SensorRecord toSensorRecord(SensorAggregate aggregate) {
+        SensorRecord record = new SensorRecord();
+        
+        // Extract sensor ID from aggregate (format: "sensor_12345"); anything else maps to 0
+        String sensorIdStr = aggregate.sensorId;
+        int sensorIdInt = sensorIdStr != null && sensorIdStr.startsWith("sensor_") ? 
+            Integer.parseInt(sensorIdStr.substring(7)) : 0;
+        
+        // Device identifiers (matching JDBCFlinkConsumer.java)
+        record.device_id = sensorIdStr;
+        record.device_type = aggregate.sensorType;
+        record.customer_id = "customer_0001"; // Default value
+        record.site_id = "site_" + String.format(Locale.ROOT, "%03d", (sensorIdInt % 100) + 1);
+        
+        // Location data - defaults (matching JDBCFlinkConsumer.java)
+        record.latitude = 0.0;
+        record.longitude = 0.0;
+        record.altitude = 0.0;
+        
+        // Sensor readings from aggregate
+        record.temperature = aggregate.avgTemperature;
+        record.humidity = aggregate.avgHumidity;
+        record.pressure = aggregate.avgPressure;
+        
+        // Additional sensor readings - defaults (matching JDBCFlinkConsumer.java)
+        record.co2_level = 400.0;
+        record.noise_level = 50.0;
+        record.light_level = 500.0;
+        record.motion_detected = 0;
+        
+        // Device metrics
+        record.battery_level = aggregate.avgBatteryLevel;
+        
+        // Device metrics - defaults (matching JDBCFlinkConsumer.java)
+        record.signal_strength = -50.0;
+        record.memory_usage = 50.0;
+        record.cpu_usage = 30.0;
+        
+        // Status - convert from string to int; constant-first equals() so a null status maps to 4
+        int statusInt = "ONLINE".equals(aggregate.latestStatus) ? 1 :
+                       "OFFLINE".equals(aggregate.latestStatus) ? 2 :
+                       "MAINTENANCE".equals(aggregate.latestStatus) ? 3 : 4;
+        record.status = statusInt;
+        record.error_count = 0; // Default value
+        
+        // Network metrics - defaults (matching JDBCFlinkConsumer.java)
+        record.packets_sent = 0L;
+        record.packets_received = 0L;
+        record.bytes_sent = 0L;
+        record.bytes_received = 0L;
+        
+        return record;
+    }
+    
+    /**
+     * Convert integer sensor type to string (matching JDBCFlinkConsumer.java).
+     * 1=temperature, 2=humidity, 3=pressure, 4=motion, 5=light, 6=co2, 7=noise, 8=multisensor;
+     * any other code becomes "sensor_type_" followed by the code.
+     */
+    private static String getSensorTypeString(int sensorType) {
+        final String[] knownNames = {
+            "temperature_sensor", "humidity_sensor", "pressure_sensor", "motion_sensor",
+            "light_sensor", "co2_sensor", "noise_sensor", "multisensor"
+        };
+        // Codes are 1-based, so code N lives at index N - 1
+        if (sensorType >= 1 && sensorType <= knownNames.length) {
+            return knownNames[sensorType - 1];
+        }
+        // Unknown code: fall back to a generated name
+        return "sensor_type_" + sensorType;
+    }
+    
+    private static int asInt(Object value) {
+        // Numbers are narrowed with intValue(); null maps to 0; anything else is parsed as text
+        if (value instanceof Number number) {
+            return number.intValue();
+        }
+        return value == null
+                ? 0
+                : Integer.parseInt(value.toString());
+    }
+    
+    private static long asLong(Object value) {
+        // null -> 0; Number -> longValue(); any other object -> Long.parseLong of its text
+        if (value == null) {
+            return 0L;
+        }
+        return value instanceof Number
+                ? ((Number) value).longValue()
+                : Long.parseLong(value.toString());
+    }
+
+    private static double asDouble(Object value) {
+        // Numbers are widened with doubleValue(); null maps to 0; other objects are parsed as text
+        if (value instanceof Number numeric) {
+            return numeric.doubleValue();
+        }
+        return value == null
+                ? 0D
+                : Double.parseDouble(value.toString());
+    }
+
+    private static String asString(Object field) {
+        // Strings pass through unchanged and StringData is rendered via toString();
+        // every other value, including null, is handled by the Objects.toString fallback.
+        if (field instanceof String text) {
+            return text;
+        }
+        if (field instanceof StringData) {
+            return field.toString();
+        }
+        // Objects.toString returns the supplied default (null) when field is null
+        return Objects.toString(field, null);
+    }
+
+    private static Instant asInstant(Object field) { // accepts Instant, TimestampData or java.sql.Timestamp
+        if (field instanceof Instant instant) {
+            return instant;
+        }
+        if (field instanceof TimestampData flinkTimestamp) {
+            return flinkTimestamp.toInstant();
+        }
+        if (field instanceof java.sql.Timestamp sqlTimestamp) {
+            return sqlTimestamp.toInstant();
+        }
+        throw new IllegalArgumentException("Unsupported timestamp type: " + field); // also reached for null
+    }
+
+    private record JobOptions(String bootstrap, String database, String table, String catalog, int windowMinutes) {
+        /** Parses "--flag value" pairs; omitted flags keep their defaults. Throws IllegalArgumentException on bad input. */
+        private static JobOptions parse(String[] args) {
+            String bootstrap = "localhost:9124";
+            String database = "iot";
+            String table = "sensor_readings";
+            String catalog = "fluss";
+            int window = 1;
+
+            for (int i = 0; i < args.length; i += 2) {
+                String flag = args[i];
+                // A trailing flag has no value; detect that here instead of failing with an index error
+                String value = i + 1 < args.length ? args[i + 1] : null;
+                switch (flag) {
+                    case "--bootstrap" -> bootstrap = value;
+                    case "--database" -> database = value;
+                    case "--table" -> table = value;
+                    case "--catalog" -> catalog = value;
+                    case "--window-minutes" -> window = value == null ? window : Integer.parseInt(value);
+                    default -> throw new IllegalArgumentException("Unknown argument: " + flag);
+                }
+                if (value == null) {
+                    throw new IllegalArgumentException("Missing value for argument: " + flag);
+                }
+            }
+            // A tumbling window needs a strictly positive size; reject it before the job is built
+            if (window <= 0) {
+                throw new IllegalArgumentException("--window-minutes must be positive: " + window);
+            }
+
+            return new JobOptions(bootstrap, database, table, catalog, window);
+        }
+    }
+
+    private static class SensorReading implements java.io.Serializable { // one decoded source row
+        private static final long serialVersionUID = 1L;
+        final String sensorId; // "sensor_" + numeric id; used as the keyBy key in main
+        final String sensorType; // display name produced by getSensorTypeString
+        final String location; // "site_NNN" derived from the numeric sensor id
+        final double temperature;
+        final double humidity;
+        final double pressure;
+        final double batteryLevel;
+        final String status; // ONLINE, OFFLINE, MAINTENANCE or ERROR (set by toSensorReading)
+        final Instant eventTime; // built from the row's timestamp column via Instant.ofEpochMilli
+        final SensorData.MetaData metadata; // default metadata created in toSensorReading
+        /** Plain assignment constructor: stores each argument in the field of the same name. */
+        SensorReading(String sensorId, String sensorType, String location,
+                     double temperature, double humidity, double pressure,
+                     double batteryLevel, String status, Instant eventTime,
+                     SensorData.MetaData metadata) {
+            this.sensorId = sensorId;
+            this.sensorType = sensorType;
+            this.location = location;
+            this.temperature = temperature;
+            this.humidity = humidity;
+            this.pressure = pressure;
+            this.batteryLevel = batteryLevel;
+            this.status = status;
+            this.eventTime = eventTime;
+            this.metadata = metadata;
+        }
+    }
+
+    private static class SensorAggregateFunction
+            implements AggregateFunction<SensorReading, SensorAccumulator, SensorAccumulator> {
+        // Incremental aggregator: folds readings into a SensorAccumulator (count, sums, min/max, latest status).
+        @Override
+        public SensorAccumulator createAccumulator() {
+            return new SensorAccumulator();
+        }
+        /** Folds one reading into the accumulator in place and returns that same accumulator. */
+        @Override
+        public SensorAccumulator add(SensorReading value, SensorAccumulator accumulator) {
+            // First record: copy the identity fields and seed min/max/latest from this reading
+            if (accumulator.count == 0) {
+                accumulator.sensorId = value.sensorId;
+                accumulator.sensorType = value.sensorType;
+                accumulator.location = value.location;
+                accumulator.metadata = value.metadata;
+                // Seed min/max with the first values, replacing the +/- infinity field defaults
+                accumulator.temperatureMin = value.temperature;
+                accumulator.temperatureMax = value.temperature;
+                accumulator.humidityMin = value.humidity;
+                accumulator.humidityMax = value.humidity;
+                accumulator.pressureMin = value.pressure;
+                accumulator.pressureMax = value.pressure;
+                accumulator.batteryMin = value.batteryLevel;
+                accumulator.batteryMax = value.batteryLevel;
+                accumulator.latestEventTime = value.eventTime;
+                accumulator.latestStatus = value.status;
+            } else {
+                // min and max were seeded from the same value, so else-if is enough: a value below min cannot also exceed max
+                double temp = value.temperature;
+                if (temp < accumulator.temperatureMin) accumulator.temperatureMin = temp;
+                else if (temp > accumulator.temperatureMax) accumulator.temperatureMax = temp;
+                
+                double hum = value.humidity;
+                if (hum < accumulator.humidityMin) accumulator.humidityMin = hum;
+                else if (hum > accumulator.humidityMax) accumulator.humidityMax = hum;
+                
+                double press = value.pressure;
+                if (press < accumulator.pressureMin) accumulator.pressureMin = press;
+                else if (press > accumulator.pressureMax) accumulator.pressureMax = press;
+                
+                double bat = value.batteryLevel;
+                if (bat < accumulator.batteryMin) accumulator.batteryMin = bat;
+                else if (bat > accumulator.batteryMax) accumulator.batteryMax = bat;
+                
+                // Status follows the strictly newest event time; on a tie the earlier-added reading's status is kept
+                if (value.eventTime.isAfter(accumulator.latestEventTime)) {
+                    accumulator.latestEventTime = value.eventTime;
+                    accumulator.latestStatus = value.status;
+                }
+            }
+
+            accumulator.count++;
+            accumulator.temperatureSum += value.temperature;
+            accumulator.humiditySum += value.humidity;
+            accumulator.pressureSum += value.pressure;
+            accumulator.batterySum += value.batteryLevel;
+            
+            return accumulator;
+        }
+        /** Combines two partial accumulators; an empty side is ignored and the other side is returned as-is. */
+        @Override
+        public SensorAccumulator merge(SensorAccumulator a, SensorAccumulator b) {
+            if (a.count == 0) {
+                return b;
+            }
+            if (b.count == 0) {
+                return a;
+            }
+
+            SensorAccumulator result = new SensorAccumulator();
+            result.sensorId = a.sensorId;
+            result.sensorType = a.sensorType;
+            result.location = a.location;
+            result.metadata = a.metadata;
+
+            result.count = a.count + b.count;
+
+            result.temperatureSum = a.temperatureSum + b.temperatureSum;
+            result.temperatureMin = Math.min(a.temperatureMin, b.temperatureMin);
+            result.temperatureMax = Math.max(a.temperatureMax, b.temperatureMax);
+
+            result.humiditySum = a.humiditySum + b.humiditySum;
+            result.humidityMin = Math.min(a.humidityMin, b.humidityMin);
+            result.humidityMax = Math.max(a.humidityMax, b.humidityMax);
+
+            result.pressureSum = a.pressureSum + b.pressureSum;
+            result.pressureMin = Math.min(a.pressureMin, b.pressureMin);
+            result.pressureMax = Math.max(a.pressureMax, b.pressureMax);
+
+            result.batterySum = a.batterySum + b.batterySum;
+            result.batteryMin = Math.min(a.batteryMin, b.batteryMin);
+            result.batteryMax = Math.max(a.batteryMax, b.batteryMax);
+            // Take the status of whichever side has the later event time; b wins ties and when a has no time
+            if (a.latestEventTime != null && b.latestEventTime != null) {
+                if (a.latestEventTime.isAfter(b.latestEventTime)) {
+                    result.latestEventTime = a.latestEventTime;
+                    result.latestStatus = a.latestStatus;
+                } else {
+                    result.latestEventTime = b.latestEventTime;
+                    result.latestStatus = b.latestStatus;
+                }
+            } else if (a.latestEventTime != null) {
+                result.latestEventTime = a.latestEventTime;
+                result.latestStatus = a.latestStatus;
+            } else {
+                result.latestEventTime = b.latestEventTime;
+                result.latestStatus = b.latestStatus;
+            }
+            return result;
+        }
+        /** The accumulator itself is the result; WindowEnricher computes the averages from it. */
+        @Override
+        public SensorAccumulator getResult(SensorAccumulator accumulator) {
+            return accumulator;
+        }
+    }
+
+    private static class WindowEnricher extends ProcessWindowFunction<
+            SensorAccumulator, SensorAggregate, String, TimeWindow> {
+        /**
+         * Turns the window's pre-aggregated accumulator into one {@code SensorAggregate} carrying
+         * the window bounds and the per-metric averages; an empty accumulator emits nothing.
+         */
+        @Override
+        public void process(String key, Context context, Iterable<SensorAccumulator> elements, Collector<SensorAggregate> out) {
+            SensorAccumulator acc = elements.iterator().next();
+            // Only emit for accumulators that actually received at least one record
+            if (acc.count != 0) {
+                TimeWindow window = context.window();
+                // One division, then four multiplications, for the averages
+                double reciprocal = 1.0 / acc.count;
+                // A missing latest event time is reported as epoch millisecond 0
+                long latestEventMillis =
+                        acc.latestEventTime == null ? 0L : acc.latestEventTime.toEpochMilli();
+
+                out.collect(new SensorAggregate(
+                        key,
+                        acc.sensorType,
+                        acc.location,
+                        window.getStart(),
+                        window.getEnd(),
+                        acc.temperatureSum * reciprocal,
+                        acc.temperatureMin,
+                        acc.temperatureMax,
+                        acc.humiditySum * reciprocal,
+                        acc.humidityMin,
+                        acc.humidityMax,
+                        acc.pressureSum * reciprocal,
+                        acc.pressureMin,
+                        acc.pressureMax,
+                        acc.batterySum * reciprocal,
+                        acc.batteryMin,
+                        acc.batteryMax,
+                        acc.latestStatus,
+                        latestEventMillis,
+                        acc.metadata));
+            }
+        }
+    }
+
+    private static final class SensorAccumulator implements java.io.Serializable { // mutable window state
+        private static final long serialVersionUID = 1L;
+        private String sensorId; // identity fields below are copied from the first reading added
+        private String sensorType;
+        private String location;
+        private SensorData.MetaData metadata;
+        private long count = 0L; // number of readings folded in; 0 means "empty"
+        // Per-metric running sum plus min/max; the infinities are placeholders until the first reading
+        private double temperatureSum = 0D;
+        private double temperatureMin = Double.POSITIVE_INFINITY;
+        private double temperatureMax = Double.NEGATIVE_INFINITY;
+
+        private double humiditySum = 0D;
+        private double humidityMin = Double.POSITIVE_INFINITY;
+        private double humidityMax = Double.NEGATIVE_INFINITY;
+
+        private double pressureSum = 0D;
+        private double pressureMin = Double.POSITIVE_INFINITY;
+        private double pressureMax = Double.NEGATIVE_INFINITY;
+
+        private double batterySum = 0D;
+        private double batteryMin = Double.POSITIVE_INFINITY;
+        private double batteryMax = Double.NEGATIVE_INFINITY;
+        // Event time and status of the newest reading seen so far (null while empty)
+        private Instant latestEventTime;
+        private String latestStatus;
+    }
+
+    /**
+     * Flat record populated by toSensorRecord from a window aggregate plus fixed default values.
+     * NOTE(review): described as matching the JDBCFlinkConsumer.java / ClickHouse schema - confirm.
+     */
+    private static class SensorRecord implements java.io.Serializable {
+        private static final long serialVersionUID = 1L;
+        // Device identifiers
+        String device_id;
+        String device_type;
+        String customer_id;
+        String site_id;
+        
+        // Location data (toSensorRecord fills these with 0.0)
+        double latitude;
+        double longitude;
+        double altitude;
+        
+        // Sensor readings (temperature/humidity/pressure hold window averages; the rest are defaults)
+        double temperature;
+        double humidity;
+        double pressure;
+        double co2_level;
+        double noise_level;
+        double light_level;
+        int motion_detected;
+        
+        // Device metrics (battery_level holds the window average; the rest are defaults)
+        double battery_level;
+        double signal_strength;
+        double memory_usage;
+        double cpu_usage;
+        
+        // Status: 1=ONLINE, 2=OFFLINE, 3=MAINTENANCE, 4=anything else (see toSensorRecord)
+        int status;
+        int error_count;
+        
+        // Network metrics (toSensorRecord fills these with 0)
+        long packets_sent;
+        long packets_received;
+        long bytes_sent;
+        long bytes_received;
+        /** Renders only the identifiers, four readings and status; the remaining fields are omitted. */
+        @Override
+        public String toString() {
+            return String.format(
+                    Locale.ROOT,
+                    "SensorRecord{device_id=%s, device_type=%s, customer_id=%s, site_id=%s, " +
+                    "temperature=%.2f, humidity=%.2f, pressure=%.2f, battery_level=%.2f, status=%d}",
+                    device_id, device_type, customer_id, site_id,
+                    temperature, humidity, pressure, battery_level, status);
+        }
+    }
+
+    private static class SensorAggregate implements java.io.Serializable { // one sensor's result for one window
+        private static final long serialVersionUID = 1L;
+        final String sensorId; // the window key
+        final String sensorType;
+        final String location;
+        final long windowStart; // window start as epoch milliseconds
+        final long windowEnd; // window end as epoch milliseconds; printed as exclusive in toString
+        final double avgTemperature;
+        final double minTemperature;
+        final double maxTemperature;
+        final double avgHumidity;
+        final double minHumidity;
+        final double maxHumidity;
+        final double avgPressure;
+        final double minPressure;
+        final double maxPressure;
+        final double avgBatteryLevel;
+        final double minBatteryLevel;
+        final double maxBatteryLevel;
+        final String latestStatus; // status of the reading with the newest event time
+        final long latestEventTime; // epoch milliseconds; WindowEnricher passes 0 when unknown
+        final SensorData.MetaData metadata;
+        /** Plain assignment constructor: stores each argument in the field of the same name. */
+        SensorAggregate(String sensorId, String sensorType, String location,
+                       long windowStart, long windowEnd,
+                       double avgTemperature, double minTemperature, double maxTemperature,
+                       double avgHumidity, double minHumidity, double maxHumidity,
+                       double avgPressure, double minPressure, double maxPressure,
+                       double avgBatteryLevel, double minBatteryLevel, double maxBatteryLevel,
+                       String latestStatus, long latestEventTime, SensorData.MetaData metadata) {
+            this.sensorId = sensorId;
+            this.sensorType = sensorType;
+            this.location = location;
+            this.windowStart = windowStart;
+            this.windowEnd = windowEnd;
+            this.avgTemperature = avgTemperature;
+            this.minTemperature = minTemperature;
+            this.maxTemperature = maxTemperature;
+            this.avgHumidity = avgHumidity;
+            this.minHumidity = minHumidity;
+            this.maxHumidity = maxHumidity;
+            this.avgPressure = avgPressure;
+            this.minPressure = minPressure;
+            this.maxPressure = maxPressure;
+            this.avgBatteryLevel = avgBatteryLevel;
+            this.minBatteryLevel = minBatteryLevel;
+            this.maxBatteryLevel = maxBatteryLevel;
+            this.latestStatus = latestStatus;
+            this.latestEventTime = latestEventTime;
+            this.metadata = metadata;
+        }
+        /** Renders the sensor id, the window bounds as instants, the four averages and the latest status. */
+        @Override
+        public String toString() {
+            return String.format(
+                    Locale.ROOT,
+                    "SensorAggregate{sensorId=%s, window=[%s,%s), avgTemp=%.2f, avgHumidity=%.2f, avgPressure=%.2f, avgBattery=%.2f, status=%s}",
+                    sensorId,
+                    Instant.ofEpochMilli(windowStart),
+                    Instant.ofEpochMilli(windowEnd),
+                    avgTemperature,
+                    avgHumidity,
+                    avgPressure,
+                    avgBatteryLevel,
+                    latestStatus);
+        }
+    }
+}
diff --git a/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/inspect/FlussMetadataInspector.java b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/inspect/FlussMetadataInspector.java
new file mode 100644
index 0000000..aa131ab
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/inspect/FlussMetadataInspector.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.benchmark.e2eplatformaws.inspect;
+
+import org.apache.fluss.client.Connection;
+import org.apache.fluss.client.ConnectionFactory;
+import org.apache.fluss.client.admin.Admin;
+import org.apache.fluss.config.ConfigOptions;
+import org.apache.fluss.config.Configuration;
+
+import java.util.Collections;
+import java.util.List;
+
+/** Simple CLI to inspect Fluss metadata (databases and tables). */
+public final class FlussMetadataInspector {
+
+    private FlussMetadataInspector() {}
+
+    public static void main(String[] args) throws Exception {
+        if (args.length == 0 || args.length > 2) {
+            System.err.println("Usage: FlussMetadataInspector <bootstrap-host:port> [database]");
+            System.exit(1);
+        }
+
+        String bootstrap = args[0];
+        String databaseFilter = args.length == 2 ? args[1] : null;
+
+        Configuration conf = new Configuration();
+        conf.set(ConfigOptions.BOOTSTRAP_SERVERS, Collections.singletonList(bootstrap));
+
+        try (Connection connection = ConnectionFactory.createConnection(conf);
+                Admin admin = connection.getAdmin()) {
+            List<String> databases = admin.listDatabases().get();
+            System.out.println("Databases:");
+            databases.forEach(db -> System.out.println("  - " + db));
+
+            if (!databases.isEmpty()) {
+                if (databaseFilter != null) {
+                    printTables(admin, databaseFilter);
+                } else {
+                    for (String db : databases) {
+                        printTables(admin, db);
+                    }
+                }
+            }
+        }
+    }
+
+    private static void printTables(Admin admin, String database) {
+        try {
+            List<String> tables = admin.listTables(database).get();
+            System.out.println("Tables in database '" + database + "':");
+            if (tables.isEmpty()) {
+                System.out.println("  (none)");
+            } else {
+                tables.forEach(table -> System.out.println("  - " + table));
+            }
+        } catch (Exception e) {
+            System.err.println(
+                    "Failed to list tables for database '" + database + "': " + e.getMessage());
+        }
+    }
+}
diff --git a/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/inspect/FlussPrimaryKeySnapshotPeek.java b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/inspect/FlussPrimaryKeySnapshotPeek.java
new file mode 100644
index 0000000..36338b2
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/inspect/FlussPrimaryKeySnapshotPeek.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.benchmark.e2eplatformaws.inspect;
+
+import org.apache.fluss.client.Connection;
+import org.apache.fluss.client.ConnectionFactory;
+import org.apache.fluss.client.table.Table;
+import org.apache.fluss.client.table.scanner.batch.BatchScanner;
+import org.apache.fluss.config.ConfigOptions;
+import org.apache.fluss.config.Configuration;
+import org.apache.fluss.metadata.TableBucket;
+import org.apache.fluss.metadata.TableInfo;
+import org.apache.fluss.metadata.TablePath;
+import org.apache.fluss.row.InternalRow;
+import org.apache.fluss.utils.CloseableIterator;
+
+import java.io.IOException;
+import java.time.Duration;
+import java.util.Collections;
+
+/** Utility to read the current snapshot of a non-partitioned primary key table from Fluss. */
+public final class FlussPrimaryKeySnapshotPeek {
+
+    private FlussPrimaryKeySnapshotPeek() {}
+
+    /** Prints up to {@code limit} rows (default 20), visiting the buckets in index order. */
+    public static void main(String[] args) throws Exception {
+        if (args.length < 3 || args.length > 4) {
+            System.err.println(
+                    "Usage: FlussPrimaryKeySnapshotPeek <bootstrap-host:port> <database> <table> [limit]\n"
+                            + "Example: FlussPrimaryKeySnapshotPeek localhost:9123 iot sensor_readings 20");
+            System.exit(1);
+        }
+
+        String database = args[1];
+        String tableName = args[2];
+        int limit = args.length == 4 ? Integer.parseInt(args[3]) : 20;
+        if (limit <= 0) {
+            // Nothing would be scanned, yet the summary below would report "(no rows found)".
+            System.err.println("Limit must be a positive integer, got: " + limit);
+            System.exit(1);
+        }
+
+        Configuration conf = new Configuration();
+        conf.set(ConfigOptions.BOOTSTRAP_SERVERS, Collections.singletonList(args[0]));
+
+        try (Connection connection = ConnectionFactory.createConnection(conf);
+                Table table = connection.getTable(TablePath.of(database, tableName))) {
+            TableInfo tableInfo = table.getTableInfo();
+            if (!tableInfo.hasPrimaryKey()) {
+                System.err.println("Table is not a primary-key table; snapshot peek is not supported.");
+                return;
+            }
+            if (tableInfo.isPartitioned()) {
+                System.err.println("Partitioned primary-key tables are not supported by this helper yet.");
+                return;
+            }
+            long tableId = tableInfo.getTableId();
+            int numBuckets = tableInfo.getNumBuckets();
+            System.out.printf(
+                    "Reading snapshot from %d buckets for table %s.%s (limit=%d)%n",
+                    numBuckets, database, tableName, limit);
+
+            int remaining = limit;
+            for (int bucket = 0; bucket < numBuckets && remaining > 0; bucket++) {
+                TableBucket tableBucket = new TableBucket(tableId, bucket);
+                try (BatchScanner scanner =
+                        table.newScan().limit(remaining).createBatchScanner(tableBucket)) {
+                    remaining = dumpBucket(scanner, bucket, remaining);
+                }
+            }
+
+            if (remaining == limit) {
+                System.out.println("(no rows found)");
+            } else if (remaining > 0) {
+                System.out.printf("Reached end of table after printing %d rows.%n", limit - remaining);
+            }
+        }
+    }
+
+    /** Prints rows until {@code remaining} reaches 0 or pollBatch returns null; returns the rest. */
+    private static int dumpBucket(BatchScanner scanner, int bucket, int remaining) throws IOException {
+        while (remaining > 0) {
+            try (CloseableIterator<InternalRow> rows = scanner.pollBatch(Duration.ofMillis(500))) {
+                if (rows == null) {
+                    break;
+                }
+                while (rows.hasNext() && remaining > 0) {
+                    System.out.printf("bucket=%d %s%n", bucket, rows.next());
+                    remaining--;
+                }
+            }
+        }
+        return remaining;
+    }
+}
diff --git a/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/inspect/FlussTableBucketChecker.java b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/inspect/FlussTableBucketChecker.java
new file mode 100644
index 0000000..1427a30
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/inspect/FlussTableBucketChecker.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.benchmark.e2eplatformaws.inspect;
+
+import org.apache.fluss.client.Connection;
+import org.apache.fluss.client.ConnectionFactory;
+import org.apache.fluss.client.admin.Admin;
+import org.apache.fluss.config.ConfigOptions;
+import org.apache.fluss.config.Configuration;
+import org.apache.fluss.metadata.TablePath;
+
+import java.util.Collections;
+
+/**
+ * Simple CLI utility that checks a Fluss table's bucket count through the Fluss Admin API.
+ * Exits with status 0 only when the table has {@link #EXPECTED_BUCKETS} buckets, 1 otherwise.
+ */
+public final class FlussTableBucketChecker {
+
+    /** Bucket count the inspected table is expected to have. */
+    private static final int EXPECTED_BUCKETS = 48;
+
+    private FlussTableBucketChecker() {}
+
+    /** Entry point; expects {@code <bootstrap-host:port> <database> <table>}. */
+    public static void main(String[] args) throws Exception {
+        if (args.length != 3) {
+            System.err.println("Usage: FlussTableBucketChecker <bootstrap-host:port> <database> <table>");
+            System.err.println("Example: FlussTableBucketChecker localhost:9124 iot sensor_readings");
+            System.exit(1);
+        }
+
+        String bootstrap = args[0];
+        String database = args[1];
+        String table = args[2];
+
+        Configuration conf = new Configuration();
+        conf.set(ConfigOptions.BOOTSTRAP_SERVERS, Collections.singletonList(bootstrap));
+        TablePath tablePath = TablePath.of(database, table);
+
+        // System.exit inside the try block would skip closing Admin/Connection; exit afterwards.
+        int exitCode;
+        try (Connection connection = ConnectionFactory.createConnection(conf);
+                Admin admin = connection.getAdmin()) {
+            if (!admin.tableExists(tablePath).get()) {
+                System.err.println("ERROR: Table '" + tablePath + "' does not exist");
+                exitCode = 1;
+            } else {
+                int bucketCount = admin.getTableInfo(tablePath).get().getNumBuckets();
+                System.out.println("========================================");
+                System.out.println("Fluss Table Bucket Count Check");
+                System.out.println("========================================");
+                System.out.println("Bootstrap:  " + bootstrap);
+                System.out.println("Database:   " + database);
+                System.out.println("Table:      " + table);
+                System.out.println("Buckets:    " + bucketCount);
+                System.out.println("========================================");
+                if (bucketCount == EXPECTED_BUCKETS) {
+                    System.out.println("✓ Table has " + EXPECTED_BUCKETS + " buckets as expected");
+                    exitCode = 0;
+                } else {
+                    System.out.println(
+                            "⚠ WARNING: Table has " + bucketCount
+                                    + " buckets, expected " + EXPECTED_BUCKETS);
+                    exitCode = 1;
+                }
+            }
+        } catch (Exception e) {
+            System.err.println("ERROR: Failed to check table bucket count");
+            System.err.println("Error: " + e.getMessage());
+            e.printStackTrace();
+            exitCode = 1;
+        }
+        System.exit(exitCode);
+    }
+}
+
+
diff --git a/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/inspect/FlussTableLogPeek.java b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/inspect/FlussTableLogPeek.java
new file mode 100644
index 0000000..2c3055f
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/inspect/FlussTableLogPeek.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.benchmark.e2eplatformaws.inspect;
+
+import org.apache.fluss.client.Connection;
+import org.apache.fluss.client.ConnectionFactory;
+import org.apache.fluss.client.table.Table;
+import org.apache.fluss.client.table.scanner.ScanRecord;
+import org.apache.fluss.client.table.scanner.log.LogScanner;
+import org.apache.fluss.client.table.scanner.log.ScanRecords;
+import org.apache.fluss.config.ConfigOptions;
+import org.apache.fluss.config.Configuration;
+import org.apache.fluss.metadata.TableInfo;
+import org.apache.fluss.metadata.TablePath;
+
+import java.time.Duration;
+import java.util.Collections;
+
+/**
+ * Simple helper that prints a few records from the Fluss change log so we can confirm data flow.
+ */
+public final class FlussTableLogPeek {
+
+    private FlussTableLogPeek() {}
+
+    public static void main(String[] args) throws Exception {
+        if (args.length < 3 || args.length > 4) {
+            System.err.println(
+                    "Usage: FlussTableLogPeek <bootstrap-host:port> <database> <table> [limit]"
+                            + System.lineSeparator()
+                            + "Example: FlussTableLogPeek localhost:9123 iot sensor_readings 5");
+            System.exit(1);
+        }
+
+        String bootstrap = args[0];
+        String database = args[1];
+        String tableName = args[2];
+        int limit = args.length == 4 ? Integer.parseInt(args[3]) : 10;
+
+        Configuration conf = new Configuration();
+        conf.set(ConfigOptions.BOOTSTRAP_SERVERS, Collections.singletonList(bootstrap));
+
+        try (Connection connection = ConnectionFactory.createConnection(conf);
+                Table table = connection.getTable(TablePath.of(database, tableName))) {
+            TableInfo tableInfo = table.getTableInfo();
+            int buckets = tableInfo.getNumBuckets();
+            System.out.printf(
+                    "Subscribing to %d buckets for table %s.%s%n", buckets, database, tableName);
+
+            try (LogScanner scanner = table.newScan().createLogScanner()) {
+                for (int bucket = 0; bucket < buckets; bucket++) {
+                    scanner.subscribeFromBeginning(bucket);
+                }
+
+                int printed = 0;
+                int emptyPolls = 0;
+                while (printed < limit && emptyPolls < 5) {
+                    ScanRecords records = scanner.poll(Duration.ofSeconds(1));
+                    if (records.isEmpty()) {
+                        emptyPolls++;
+                        continue;
+                    }
+                    for (ScanRecord record : records) {
+                        System.out.println(record);
+                        printed++;
+                        if (printed >= limit) {
+                            break;
+                        }
+                    }
+                }
+
+                if (printed == 0) {
+                    System.out.println("No records found (table might be empty or producer not running).");
+                } else if (printed < limit) {
+                    System.out.printf("Displayed %d records (no more records available now).%n", printed);
+                }
+            }
+        }
+    }
+}
diff --git a/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/model/SensorData.java b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/model/SensorData.java
new file mode 100644
index 0000000..0c413c9
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/model/SensorData.java
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.benchmark.e2eplatformaws.model;
+
+import java.io.Serializable;
+import java.time.Instant;
+import java.util.Objects;
+
+/**
+ * Sensor data record used in the demo. This mirrors the schema that was used in the
+ * RealtimeDataPlatform example; the metadata travels in a nested {@link MetaData} object.
+ *
+ * <p>Plain mutable bean: a no-arg constructor, one getter/setter pair per field, and
+ * {@code equals}/{@code hashCode}/{@code toString} that take every field into account.
+ */
+public class SensorData implements Serializable {
+    // Textual descriptors of the sensor.
+    private String sensorId;
+    private String sensorType;
+    private String location;
+
+    // Numeric readings.
+    private double temperature;
+    private double humidity;
+    private double pressure;
+    private double batteryLevel;
+
+    // Reported status, event timestamp and nested metadata.
+    private String status;
+    private Instant eventTime;
+    private MetaData metadata;
+
+    /** Creates an instance whose fields all hold their Java default values. */
+    public SensorData() {}
+
+    /**
+     * Creates a fully populated record. Every argument is stored as given; nothing is validated
+     * or copied.
+     */
+    public SensorData(
+            String sensorId,
+            String sensorType,
+            String location,
+            double temperature,
+            double humidity,
+            double pressure,
+            double batteryLevel,
+            String status,
+            Instant eventTime,
+            MetaData metadata) {
+        this.sensorId = sensorId;
+        this.sensorType = sensorType;
+        this.location = location;
+        this.status = status;
+        this.eventTime = eventTime;
+        this.metadata = metadata;
+        this.temperature = temperature;
+        this.humidity = humidity;
+        this.pressure = pressure;
+        this.batteryLevel = batteryLevel;
+    }
+
+    public String getSensorId() {
+        return sensorId;
+    }
+
+    public void setSensorId(String sensorId) {
+        this.sensorId = sensorId;
+    }
+
+    public String getSensorType() {
+        return sensorType;
+    }
+
+    public void setSensorType(String sensorType) {
+        this.sensorType = sensorType;
+    }
+
+    public String getLocation() {
+        return location;
+    }
+
+    public void setLocation(String location) {
+        this.location = location;
+    }
+
+    public double getTemperature() {
+        return temperature;
+    }
+
+    public void setTemperature(double temperature) {
+        this.temperature = temperature;
+    }
+
+    public double getHumidity() {
+        return humidity;
+    }
+
+    public void setHumidity(double humidity) {
+        this.humidity = humidity;
+    }
+
+    public double getPressure() {
+        return pressure;
+    }
+
+    public void setPressure(double pressure) {
+        this.pressure = pressure;
+    }
+
+    public double getBatteryLevel() {
+        return batteryLevel;
+    }
+
+    public void setBatteryLevel(double batteryLevel) {
+        this.batteryLevel = batteryLevel;
+    }
+
+    public String getStatus() {
+        return status;
+    }
+
+    public void setStatus(String status) {
+        this.status = status;
+    }
+
+    public Instant getEventTime() {
+        return eventTime;
+    }
+
+    public void setEventTime(Instant eventTime) {
+        this.eventTime = eventTime;
+    }
+
+    public MetaData getMetadata() {
+        return metadata;
+    }
+
+    public void setMetadata(MetaData metadata) {
+        this.metadata = metadata;
+    }
+
+    /** Compares all ten fields; the double readings are compared with {@link Double#compare}. */
+    @Override
+    public boolean equals(Object o) {
+        if (o == this) {
+            return true;
+        }
+        if (!(o instanceof SensorData)) {
+            return false;
+        }
+        final SensorData other = (SensorData) o;
+        // Numeric fields are checked first, then the reference fields, stopping at a mismatch.
+        if (Double.compare(other.temperature, temperature) != 0
+                || Double.compare(other.humidity, humidity) != 0
+                || Double.compare(other.pressure, pressure) != 0
+                || Double.compare(other.batteryLevel, batteryLevel) != 0) {
+            return false;
+        }
+        return Objects.equals(sensorId, other.sensorId)
+                && Objects.equals(sensorType, other.sensorType)
+                && Objects.equals(location, other.location)
+                && Objects.equals(status, other.status)
+                && Objects.equals(eventTime, other.eventTime)
+                && Objects.equals(metadata, other.metadata);
+    }
+
+    /** Hashes the same ten fields that {@link #equals(Object)} compares. */
+    @Override
+    public int hashCode() {
+        return Objects.hash(
+                sensorId, sensorType, location,
+                temperature, humidity, pressure, batteryLevel,
+                status, eventTime, metadata);
+    }
+
+    /** Lists every field as name=value; the String fields are wrapped in single quotes. */
+    @Override
+    public String toString() {
+        StringBuilder text = new StringBuilder("SensorData{");
+        text.append("sensorId='").append(sensorId).append('\'');
+        text.append(", sensorType='").append(sensorType).append('\'');
+        text.append(", location='").append(location).append('\'');
+        text.append(", temperature=").append(temperature);
+        text.append(", humidity=").append(humidity);
+        text.append(", pressure=").append(pressure);
+        text.append(", batteryLevel=").append(batteryLevel);
+        text.append(", status='").append(status).append('\'');
+        text.append(", eventTime=").append(eventTime);
+        text.append(", metadata=").append(metadata);
+        return text.append('}').toString();
+    }
+
+    /**
+     * Nested metadata payload holding manufacturer, model, firmware version and a
+     * latitude/longitude pair. It is a mutable bean with value-based equality as well.
+     */
+    public static class MetaData implements Serializable {
+        private String manufacturer;
+        private String model;
+        private String firmwareVersion;
+        private double latitude;
+        private double longitude;
+
+        /** Creates an instance whose fields all hold their Java default values. */
+        public MetaData() {}
+
+        /** Creates a fully populated payload; every argument is stored as given. */
+        public MetaData(
+                String manufacturer,
+                String model,
+                String firmwareVersion,
+                double latitude,
+                double longitude) {
+            this.manufacturer = manufacturer;
+            this.model = model;
+            this.firmwareVersion = firmwareVersion;
+            this.latitude = latitude;
+            this.longitude = longitude;
+        }
+
+        public String getManufacturer() {
+            return manufacturer;
+        }
+
+        public void setManufacturer(String manufacturer) {
+            this.manufacturer = manufacturer;
+        }
+
+        public String getModel() {
+            return model;
+        }
+
+        public void setModel(String model) {
+            this.model = model;
+        }
+
+        public String getFirmwareVersion() {
+            return firmwareVersion;
+        }
+
+        public void setFirmwareVersion(String firmwareVersion) {
+            this.firmwareVersion = firmwareVersion;
+        }
+
+        public double getLatitude() {
+            return latitude;
+        }
+
+        public void setLatitude(double latitude) {
+            this.latitude = latitude;
+        }
+
+        public double getLongitude() {
+            return longitude;
+        }
+
+        public void setLongitude(double longitude) {
+            this.longitude = longitude;
+        }
+
+        /** Lists every field as name=value; the String fields are wrapped in single quotes. */
+        @Override
+        public String toString() {
+            StringBuilder text = new StringBuilder("MetaData{");
+            text.append("manufacturer='").append(manufacturer).append('\'');
+            text.append(", model='").append(model).append('\'');
+            text.append(", firmwareVersion='").append(firmwareVersion).append('\'');
+            text.append(", latitude=").append(latitude);
+            text.append(", longitude=").append(longitude);
+            return text.append('}').toString();
+        }
+
+        /** Compares all five fields; coordinates are compared with {@link Double#compare}. */
+        @Override
+        public boolean equals(Object o) {
+            if (o == this) {
+                return true;
+            }
+            if (!(o instanceof MetaData)) {
+                return false;
+            }
+            final MetaData other = (MetaData) o;
+            if (Double.compare(other.latitude, latitude) != 0
+                    || Double.compare(other.longitude, longitude) != 0) {
+                return false;
+            }
+            return Objects.equals(manufacturer, other.manufacturer)
+                    && Objects.equals(model, other.model)
+                    && Objects.equals(firmwareVersion, other.firmwareVersion);
+        }
+
+        /** Hashes the same five fields that {@link #equals(Object)} compares. */
+        @Override
+        public int hashCode() {
+            return Objects.hash(manufacturer, model, firmwareVersion, latitude, longitude);
+        }
+    }
+}
diff --git a/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/model/SensorDataMinimal.java b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/model/SensorDataMinimal.java
new file mode 100644
index 0000000..251adbc
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/model/SensorDataMinimal.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.benchmark.e2eplatformaws.model;
+
+import java.io.Serializable;
+
+/**
+ * Minimal sensor record matching the schema used by {@code JDBCFlinkConsumer.java} in
+ * RealtimeDataPlatform. Identifiers and categories are integer-coded:
+ *
+ * <ul>
+ *   <li>{@code sensorType}: 1=temperature, 2=humidity, 3=pressure, 4=motion, 5=light, 6=co2,
+ *       7=noise, 8=multisensor
+ *   <li>{@code status}: 1=online, 2=offline, 3=maintenance, 4=error
+ *   <li>{@code timestamp}: milliseconds since epoch
+ * </ul>
+ */
+public class SensorDataMinimal implements Serializable {
+    private int sensorId;
+    private int sensorType;
+    private double temperature;
+    private double humidity;
+    private double pressure;
+    private double batteryLevel;
+    private int status;
+    private long timestamp;
+
+    public SensorDataMinimal() {}
+
+    public SensorDataMinimal(
+            int sensorId,
+            int sensorType,
+            double temperature,
+            double humidity,
+            double pressure,
+            double batteryLevel,
+            int status,
+            long timestamp) {
+        this.sensorId = sensorId;
+        this.sensorType = sensorType;
+        this.status = status;
+        this.timestamp = timestamp;
+        this.temperature = temperature;
+        this.humidity = humidity;
+        this.pressure = pressure;
+        this.batteryLevel = batteryLevel;
+    }
+
+    // Accessors: getters first, then the matching setters in the same field order.
+    public int getSensorId() { return sensorId; }
+    public int getSensorType() { return sensorType; }
+    public double getTemperature() { return temperature; }
+    public double getHumidity() { return humidity; }
+    public double getPressure() { return pressure; }
+    public double getBatteryLevel() { return batteryLevel; }
+    public int getStatus() { return status; }
+    public long getTimestamp() { return timestamp; }
+
+    public void setSensorId(int sensorId) { this.sensorId = sensorId; }
+    public void setSensorType(int sensorType) { this.sensorType = sensorType; }
+    public void setTemperature(double temperature) { this.temperature = temperature; }
+    public void setHumidity(double humidity) { this.humidity = humidity; }
+    public void setPressure(double pressure) { this.pressure = pressure; }
+    public void setBatteryLevel(double batteryLevel) { this.batteryLevel = batteryLevel; }
+    public void setStatus(int status) { this.status = status; }
+    public void setTimestamp(long timestamp) { this.timestamp = timestamp; }
+}
+
diff --git a/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/model/SensorDataRealtimePlatform.java b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/model/SensorDataRealtimePlatform.java
new file mode 100644
index 0000000..7528e7c
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/model/SensorDataRealtimePlatform.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.benchmark.e2eplatformaws.model;
+
+import java.io.Serializable;
+import java.time.Instant;
+
+/**
+ * Sensor data model matching the RealtimeDataPlatform {@code benchmark.sensors_local} schema, as
+ * defined by the producer-load module of RealtimeDataPlatform/realtime-platform-1million-events.
+ *
+ * <p>Only essential fields are stored in the Fluss table; the sink defaults the remaining fields.
+ */
+public class SensorDataRealtimePlatform implements Serializable {
+    private static final long serialVersionUID = 1L;
+
+    // Device identifiers
+    private String deviceId;
+    private String deviceType;
+    private String customerId;
+    private String siteId;
+
+    // Location data
+    private double latitude;
+    private double longitude;
+    private float altitude;  // Default value at sink: 0.0
+
+    // Timestamp
+    private Instant time;
+
+    // Sensor readings (stored in Fluss)
+    private float temperature;
+    private float humidity;
+    private float pressure;
+
+    // Additional sensor readings (default values at sink)
+    private float co2Level;  // Default: 0.0
+    private float noiseLevel;  // Default: 0.0
+    private float lightLevel;  // Default: 0.0
+    private int motionDetected;  // Default: 0
+
+    // Device metrics (stored in Fluss: battery_level only)
+    private float batteryLevel;
+    private float signalStrength;  // Default: 0.0
+    private float memoryUsage;  // Default: 0.0
+    private float cpuUsage;  // Default: 0.0
+
+    // Status (stored in Fluss as INT: 1=online, 2=offline, 3=maintenance, 4=error)
+    private int status;
+    private long errorCount;  // Default: 0L
+
+    // Network metrics (default values at sink)
+    private long packetsSent;  // Default: 0L
+    private long packetsReceived;  // Default: 0L
+    private long bytesSent;  // Default: 0L
+    private long bytesReceived;  // Default: 0L
+
+    public SensorDataRealtimePlatform() {}
+
+    // Getters and Setters
+    public String getDeviceId() { return deviceId; }
+    public void setDeviceId(String deviceId) { this.deviceId = deviceId; }
+
+    public String getDeviceType() { return deviceType; }
+    public void setDeviceType(String deviceType) { this.deviceType = deviceType; }
+
+    public String getCustomerId() { return customerId; }
+    public void setCustomerId(String customerId) { this.customerId = customerId; }
+
+    public String getSiteId() { return siteId; }
+    public void setSiteId(String siteId) { this.siteId = siteId; }
+
+    public double getLatitude() { return latitude; }
+    public void setLatitude(double latitude) { this.latitude = latitude; }
+
+    public double getLongitude() { return longitude; }
+    public void setLongitude(double longitude) { this.longitude = longitude; }
+
+    public float getAltitude() { return altitude; }
+    public void setAltitude(float altitude) { this.altitude = altitude; }
+
+    public Instant getTime() { return time; }
+    public void setTime(Instant time) { this.time = time; }
+
+    public float getTemperature() { return temperature; }
+    public void setTemperature(float temperature) { this.temperature = temperature; }
+
+    public float getHumidity() { return humidity; }
+    public void setHumidity(float humidity) { this.humidity = humidity; }
+
+    public float getPressure() { return pressure; }
+    public void setPressure(float pressure) { this.pressure = pressure; }
+
+    public float getCo2Level() { return co2Level; }
+    public void setCo2Level(float co2Level) { this.co2Level = co2Level; }
+
+    public float getNoiseLevel() { return noiseLevel; }
+    public void setNoiseLevel(float noiseLevel) { this.noiseLevel = noiseLevel; }
+
+    public float getLightLevel() { return lightLevel; }
+    public void setLightLevel(float lightLevel) { this.lightLevel = lightLevel; }
+
+    public int getMotionDetected() { return motionDetected; }
+    public void setMotionDetected(int motionDetected) { this.motionDetected = motionDetected; }
+
+    public float getBatteryLevel() { return batteryLevel; }
+    public void setBatteryLevel(float batteryLevel) { this.batteryLevel = batteryLevel; }
+
+    public float getSignalStrength() { return signalStrength; }
+    public void setSignalStrength(float signalStrength) { this.signalStrength = signalStrength; }
+
+    public float getMemoryUsage() { return memoryUsage; }
+    public void setMemoryUsage(float memoryUsage) { this.memoryUsage = memoryUsage; }
+
+    public float getCpuUsage() { return cpuUsage; }
+    public void setCpuUsage(float cpuUsage) { this.cpuUsage = cpuUsage; }
+
+    public int getStatus() { return status; }
+    public void setStatus(int status) { this.status = status; }
+
+    public long getErrorCount() { return errorCount; }
+    public void setErrorCount(long errorCount) { this.errorCount = errorCount; }
+
+    public long getPacketsSent() { return packetsSent; }
+    public void setPacketsSent(long packetsSent) { this.packetsSent = packetsSent; }
+
+    public long getPacketsReceived() { return packetsReceived; }
+    public void setPacketsReceived(long packetsReceived) { this.packetsReceived = packetsReceived; }
+
+    public long getBytesSent() { return bytesSent; }
+    public void setBytesSent(long bytesSent) { this.bytesSent = bytesSent; }
+
+    public long getBytesReceived() { return bytesReceived; }
+    public void setBytesReceived(long bytesReceived) { this.bytesReceived = bytesReceived; }
+}
diff --git a/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/producer/FlussSensorProducerApp.java b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/producer/FlussSensorProducerApp.java
new file mode 100644
index 0000000..413ed38
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/producer/FlussSensorProducerApp.java
@@ -0,0 +1,431 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.benchmark.e2eplatformaws.producer;
+
+import org.apache.fluss.benchmark.e2eplatformaws.model.SensorData;
+import org.apache.fluss.benchmark.e2eplatformaws.model.SensorData.MetaData;
+import org.apache.fluss.client.Connection;
+import org.apache.fluss.client.ConnectionFactory;
+import org.apache.fluss.client.admin.Admin;
+import org.apache.fluss.client.table.Table;
+import org.apache.fluss.client.table.writer.UpsertWriter;
+import org.apache.fluss.config.ConfigOptions;
+import org.apache.fluss.config.Configuration;
+import org.apache.fluss.config.MemorySize;
+import org.apache.fluss.metadata.DatabaseDescriptor;
+import org.apache.fluss.metadata.Schema;
+import org.apache.fluss.metadata.TableDescriptor;
+import org.apache.fluss.metadata.TablePath;
+import org.apache.fluss.row.BinaryString;
+import org.apache.fluss.row.GenericRow;
+import org.apache.fluss.row.TimestampLtz;
+import org.apache.fluss.types.DataTypes;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.time.Duration;
+import java.time.Instant;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+/**
+ * Simple Fluss producer that continuously writes randomly generated sensor data into a primary key
+ * table. The schema mirrors the IoT pipeline that previously used Pulsar.
+ */
+public final class FlussSensorProducerApp {
+    private static final Logger LOG = LoggerFactory.getLogger(FlussSensorProducerApp.class);
+
+    // Prefer IPv4. Set in a static initializer so the properties exist before main() runs.
+    static {
+        System.setProperty("java.net.preferIPv4Stack", "true");
+        // The JDK property is "preferIPv6Addresses"; "preferIPv4Addresses" is not read by the JDK.
+        System.setProperty("java.net.preferIPv6Addresses", "false");
+        System.setProperty("java.net.useSystemProxies", "false"); // ignore OS-level proxy settings
+    }
+
+    /**
+     * Entry point: parses options, ensures the target table exists, then upserts generated sensor
+     * rows until the record limit, the duration limit, or a JVM shutdown request ends the loop.
+     */
+    public static void main(String[] args) throws Exception {
+        ProducerOptions options = ProducerOptions.parse(args);
+        LOG.info("Starting Fluss producer with options: {}", options);
+
+        Configuration flussConf = new Configuration();
+        flussConf.set(ConfigOptions.BOOTSTRAP_SERVERS, Collections.singletonList(options.bootstrap));
+        flussConf.set(ConfigOptions.CLIENT_WRITER_BUFFER_MEMORY_SIZE, options.writerBufferSize);
+        flussConf.set(ConfigOptions.CLIENT_WRITER_BATCH_SIZE, options.writerBatchSize);
+
+        // Start Prometheus metrics server
+        ProducerMetrics metrics = new ProducerMetrics(8080);
+        metrics.start();
+        LOG.info("Prometheus metrics server started on port 8080");
+
+        TablePath tablePath = TablePath.of(options.database, options.table);
+
+        try (Connection connection = ConnectionFactory.createConnection(flussConf)) {
+            ensureSchema(connection, tablePath, options.bucketCount);
+
+            try (Table table = connection.getTable(tablePath)) {
+                UpsertWriter writer = table.newUpsert().createWriter();
+                AtomicBoolean running = new AtomicBoolean(true);
+                Runtime.getRuntime()
+                        .addShutdownHook(new Thread(() -> shutdown(writer, running), "fluss-producer-shutdown"));
+
+                RandomSensorDataGenerator generator =
+                        new RandomSensorDataGenerator(options.sensorPoolSize, options.statusValues);
+
+                long sent = 0;
+                long startNano = System.nanoTime();
+                long lastStatsNano = startNano;
+                long nanosPerRecord = options.recordsPerSecond > 0
+                        ? TimeUnit.SECONDS.toNanos(1) / options.recordsPerSecond
+                        : 0;
+                long stopAtCount = options.totalRecords > 0 ? options.totalRecords : Long.MAX_VALUE;
+                long stopAtTime = options.runDuration.isZero()
+                        ? Long.MAX_VALUE
+                        : System.nanoTime() + options.runDuration.toNanos();
+
+                while (running.get() && sent < stopAtCount && System.nanoTime() < stopAtTime) {
+                    SensorData record = generator.next();
+                    writeToFluss(writer, record);
+                    sent++;
+                    metrics.recordWrite(); // Record metric
+                    // A non-positive interval disables the periodic action instead of failing on "% 0".
+                    if (options.flushEvery > 0 && sent % options.flushEvery == 0) {
+                        writer.flush();
+                        LOG.debug("Flushed {} records", sent);
+                    }
+
+                    if (options.statsEvery > 0 && sent % options.statsEvery == 0) {
+                        long now = System.nanoTime();
+                        double overallRate = ratePerSecond(sent, now - startNano);
+                        double windowRate = ratePerSecond(options.statsEvery, now - lastStatsNano);
+                        LOG.info(
+                                "Produced {} records (overall ~{} rec/s, last window ~{} rec/s)",
+                                sent,
+                                String.format(Locale.ROOT, "%.0f", overallRate),
+                                String.format(Locale.ROOT, "%.0f", windowRate));
+                        lastStatsNano = now;
+                        metrics.updateStats(sent); // Update metrics stats
+                    }
+                    // Pace against the absolute schedule so that sleep overshoot does not accumulate.
+                    if (nanosPerRecord > 0) {
+                        long target = startNano + sent * nanosPerRecord;
+                        long wait = target - System.nanoTime();
+                        if (wait > 0) {
+                            TimeUnit.NANOSECONDS.sleep(wait);
+                        }
+                    }
+                }
+
+                writer.flush();
+                LOG.info("Producer stopped after emitting {} records", sent);
+            }
+        } finally {
+            metrics.stop();
+        }
+    }
+
+    private static void shutdown(UpsertWriter writer, AtomicBoolean running) { // JVM shutdown-hook body
+        LOG.info("Shutdown requested – flushing pending records");
+        running.set(false); // the produce loop in main() tests this flag before each record
+        try {
+            writer.flush();
+        } catch (Exception e) {
+            LOG.warn("Unable to flush writer during shutdown", e); // logged, not rethrown
+        }
+    }
+
+    /**
+     * Issues the create-database and create-table admin calls for {@code tablePath}, waiting for each.
+     */
+    private static void ensureSchema(Connection connection, TablePath tablePath, int bucketCount)
+            throws Exception {
+        try (Admin admin = connection.getAdmin()) {
+            DatabaseDescriptor dbDescriptor =
+                    DatabaseDescriptor.builder().comment("IoT demo database").build();
+            admin.createDatabase(tablePath.getDatabaseName(), dbDescriptor, true).get();
+
+            Schema readingsSchema = Schema.newBuilder().primaryKey("sensor_id")
+                    .column("sensor_id", DataTypes.STRING())
+                    .column("sensor_type", DataTypes.STRING())
+                    .column("location", DataTypes.STRING())
+                    .column("temperature", DataTypes.DOUBLE())
+                    .column("humidity", DataTypes.DOUBLE())
+                    .column("pressure", DataTypes.DOUBLE())
+                    .column("battery_level", DataTypes.DOUBLE())
+                    .column("status", DataTypes.STRING())
+                    .column("event_time", DataTypes.TIMESTAMP_LTZ())
+                    .column("manufacturer", DataTypes.STRING())
+                    .column("model", DataTypes.STRING())
+                    .column("firmware_version", DataTypes.STRING())
+                    .column("latitude", DataTypes.DOUBLE())
+                    .column("longitude", DataTypes.DOUBLE())
+                    .build();
+
+            TableDescriptor tableDescriptor = TableDescriptor.builder()
+                    .schema(readingsSchema)
+                    .comment("Realtime sensor readings")
+                    .distributedBy(bucketCount, "sensor_id")
+                    .build();
+
+            admin.createTable(tablePath, tableDescriptor, true).get();
+            LOG.info("Ensured Fluss table {} exists ({} buckets)", tablePath, bucketCount);
+        }
+    }
+
+    /**
+     * Converts one reading into a 14-field row and upserts it.
+     *
+     * <p>Field positions follow the column order declared in ensureSchema.
+     */
+    private static void writeToFluss(UpsertWriter writer, SensorData data) throws Exception {
+        GenericRow row = new GenericRow(14);
+        // Fields 0-8: the reading itself; null sensor type, location and status are written as "".
+        row.setField(0, BinaryString.fromString(data.getSensorId()));
+        row.setField(1, BinaryString.fromString(nonNull(data.getSensorType())));
+        row.setField(2, BinaryString.fromString(nonNull(data.getLocation())));
+        row.setField(3, data.getTemperature());
+        row.setField(4, data.getHumidity());
+        row.setField(5, data.getPressure());
+        row.setField(6, data.getBatteryLevel());
+        row.setField(7, BinaryString.fromString(nonNull(data.getStatus())));
+        row.setField(8, TimestampLtz.fromInstant(data.getEventTime()));
+
+        // Fields 9-13: device metadata, or nulls when the reading carries none.
+        MetaData meta = data.getMetadata();
+        boolean hasMeta = meta != null;
+        row.setField(9, hasMeta ? BinaryString.fromString(nonNull(meta.getManufacturer())) : null);
+        row.setField(10, hasMeta ? BinaryString.fromString(nonNull(meta.getModel())) : null);
+        row.setField(11, hasMeta ? BinaryString.fromString(nonNull(meta.getFirmwareVersion())) : null);
+        row.setField(12, hasMeta ? meta.getLatitude() : null);
+        row.setField(13, hasMeta ? meta.getLongitude() : null);
+
+        writer.upsert(row);
+    }
+
+    private static String nonNull(String value) { // null-safe: maps null to the empty string
+        return value != null ? value : "";
+    }
+
+    /**
+     * Immutable producer settings. Each value starts from a built-in default; some can be changed
+     * through environment variables, and command-line arguments are applied last.
+     *
+     * @param bootstrap bootstrap server address
+     * @param database database of the target table
+     * @param table name of the target table
+     * @param bucketCount bucket count used when creating the table
+     * @param sensorPoolSize number of distinct sensor ids to generate
+     * @param totalRecords record limit; a non-positive value means no limit
+     * @param runDuration run-time limit; zero means no limit
+     * @param recordsPerSecond target rate; a non-positive value means unthrottled
+     * @param flushEvery flush the writer every this many records
+     * @param writerBufferSize client writer buffer memory size
+     * @param writerBatchSize client writer batch size
+     * @param statsEvery log throughput every this many records
+     * @param statusValues status strings the generator picks from
+     */
+    private record ProducerOptions(
+            String bootstrap,
+            String database,
+            String table,
+            int bucketCount,
+            int sensorPoolSize,
+            long totalRecords,
+            Duration runDuration,
+            int recordsPerSecond,
+            int flushEvery,
+            MemorySize writerBufferSize,
+            MemorySize writerBatchSize,
+            int statsEvery,
+            List<String> statusValues) {
+        /**
+         * Builds the options from defaults, environment variables and {@code args}, in that order.
+         * Options may be written as {@code --name value} or {@code --name=value}.
+         */
+        private static ProducerOptions parse(String[] args) {
+            String bootstrap = "localhost:9124";
+            String database = "iot";
+            String table = "sensor_readings";
+            int bucketCount = 12;
+            int sensorPool = 10_000;
+            long totalRecords = 0L;
+            Duration runDuration = Duration.ZERO;
+
+            // Environment variables supply the defaults for the next five settings.
+            int recordsPerSecond = getIntEnv("PRODUCER_RATE", 5000);
+            int flushEvery = getIntEnv("PRODUCER_FLUSH_EVERY", 5000);
+            int statsEvery = getIntEnv("PRODUCER_STATS_EVERY", 50_000);
+            MemorySize bufferSize = MemorySize.parse(getEnv("CLIENT_WRITER_BUFFER_MEMORY_SIZE", "32mb"));
+            MemorySize batchSize = MemorySize.parse(getEnv("CLIENT_WRITER_BATCH_SIZE", "4mb"));
+            List<String> statuses = List.of("ONLINE", "OFFLINE", "MAINTENANCE", "DEGRADED");
+
+            for (int i = 0; i < args.length; i++) {
+                String arg = args[i];
+                // Split "--name=value"; without '=' the value is taken from the following argument.
+                int eqIdx = arg.indexOf('=');
+                String option = eqIdx > 0 ? arg.substring(0, eqIdx) : arg;
+                String inline = eqIdx > 0 ? arg.substring(eqIdx + 1) : null;
+
+                switch (option) {
+                    case "--bootstrap" -> bootstrap = inline != null ? inline : requireValue(option, args, ++i);
+                    case "--database" -> database = inline != null ? inline : requireValue(option, args, ++i);
+                    case "--table" -> table = inline != null ? inline : requireValue(option, args, ++i);
+                    case "--buckets" -> bucketCount = Integer.parseInt(inline != null ? inline : requireValue(option, args, ++i));
+                    case "--sensors" -> sensorPool = Integer.parseInt(inline != null ? inline : requireValue(option, args, ++i));
+                    case "--count" -> totalRecords = Long.parseLong(inline != null ? inline : requireValue(option, args, ++i));
+                    case "--duration" -> {
+                        String durationValue = inline != null ? inline : requireValue(option, args, ++i);
+                        runDuration = Duration.parse("PT" + durationValue.toUpperCase(Locale.ROOT));
+                    }
+                    // The three options below override their environment-variable defaults.
+                    case "--rate" -> recordsPerSecond = Integer.parseInt(inline != null ? inline : requireValue(option, args, ++i));
+                    case "--flush" -> flushEvery = Integer.parseInt(inline != null ? inline : requireValue(option, args, ++i));
+                    case "--stats" -> statsEvery = Integer.parseInt(inline != null ? inline : requireValue(option, args, ++i));
+                    default -> throw new IllegalArgumentException("Unknown argument: " + option);
+                }
+            }
+
+            return new ProducerOptions(
+                    bootstrap,
+                    database,
+                    table,
+                    bucketCount,
+                    sensorPool,
+                    totalRecords,
+                    runDuration,
+                    recordsPerSecond,
+                    flushEvery,
+                    bufferSize,
+                    batchSize,
+                    statsEvery,
+                    statuses);
+        }
+
+        /**
+         * Returns {@code args[index]}, or throws IllegalArgumentException when it is out of range.
+         */
+        private static String requireValue(String option, String[] args, int index) {
+            if (index < args.length) {
+                return args[index];
+            }
+            throw new IllegalArgumentException("Missing value for " + option);
+        }
+    }
+
+    /**
+     * Reads an integer setting from the environment.
+     *
+     * @return the parsed value, or {@code defaultValue} when unset, blank, or not an integer
+     */
+    private static int getIntEnv(String envVar, int defaultValue) {
+        String raw = System.getenv(envVar);
+        String trimmed = raw == null ? "" : raw.trim();
+        try {
+            return trimmed.isEmpty() ? defaultValue : Integer.parseInt(trimmed);
+        } catch (NumberFormatException e) {
+            LOG.warn("Invalid value for environment variable {}: {}, using default: {}", envVar, raw, defaultValue);
+            return defaultValue;
+        }
+    }
+
+    /**
+     * Reads a string setting from the environment.
+     *
+     * @return the trimmed value, or {@code defaultValue} when the variable is unset or blank
+     */
+    private static String getEnv(String envVar, String defaultValue) {
+        String raw = System.getenv(envVar);
+        String trimmed = raw != null ? raw.trim() : "";
+        return trimmed.isEmpty() ? defaultValue : trimmed;
+    }
+
+    // Average throughput in records per second over the given span of nanoseconds; yields 0
+    // when the span is zero or negative.
+    private static double ratePerSecond(long records, long elapsedNanos) {
+        double elapsedSeconds = elapsedNanos / 1_000_000_000d;
+        return elapsedNanos > 0 ? records / elapsedSeconds : 0d;
+    }
+
+    /** Produces random sensor readings for a fixed pool of generated sensor ids. */
+    private static final class RandomSensorDataGenerator {
+        private final List<String> sensorIds;
+        private final List<String> sensorTypes = List.of(
+                "temperature_sensor", "humidity_sensor",
+                "pressure_sensor", "motion_sensor",
+                "light_sensor", "co2_sensor",
+                "noise_sensor", "multisensor");
+        private final List<String> locations = List.of("site-nyc-1", "site-sfo-2", "site-lon-1", "site-sin-3");
+        private final List<String> manufacturers = List.of("AcmeSensors", "FluxTech", "IoTica", "HyperLoop");
+        private final List<String> models = List.of("X100", "A12", "S9", "M5");
+        private final List<String> firmwareVersions = List.of("1.0.0", "1.1.3", "2.0.1");
+        private final List<String> statusValues;
+        private final Random random = new Random();
+
+        /**
+         * @param poolSize number of distinct sensor ids to generate; must be positive
+         * @param statusValues status strings to pick from; must not be empty
+         * @throws IllegalArgumentException if poolSize is not positive or statusValues is empty
+         */
+        private RandomSensorDataGenerator(int poolSize, List<String> statusValues) {
+            if (poolSize <= 0 || statusValues.isEmpty()) {
+                throw new IllegalArgumentException(
+                        "Sensor pool size must be positive and status values non-empty, got poolSize=" + poolSize);
+            }
+            this.sensorIds = new ArrayList<>(poolSize);
+            this.statusValues = statusValues;
+            for (int i = 0; i < poolSize; i++) {
+                sensorIds.add(String.format(Locale.ROOT, "sensor-%06d", i));
+            }
+        }
+
+        /** Builds one reading with random values, stamped with the current time. */
+        private SensorData next() {
+            String sensorId = pick(sensorIds);
+            String sensorType = pick(sensorTypes);
+            String location = pick(locations);
+            double temperature = 10 + random.nextDouble() * 20;
+            double humidity = 30 + random.nextDouble() * 50;
+            double pressure = 990 + random.nextDouble() * 40;
+            double batteryLevel = 20 + random.nextDouble() * 80;
+            String status = pick(statusValues);
+            Instant eventTime = Instant.now();
+
+            double latitude = 30 + random.nextDouble() * 20;
+            double longitude = -120 + random.nextDouble() * 60;
+            MetaData meta = new MetaData(
+                    pick(manufacturers), pick(models), pick(firmwareVersions), latitude, longitude);
+
+            return new SensorData(
+                    sensorId, sensorType, location, temperature, humidity,
+                    pressure, batteryLevel, status, eventTime, meta);
+        }
+
+        /** Returns a uniformly chosen element of a non-empty list. */
+        private String pick(List<String> values) {
+            return values.get(random.nextInt(values.size()));
+        }
+    }
+}
diff --git a/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/producer/FlussSensorProducerAppMultiInstance.java b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/producer/FlussSensorProducerAppMultiInstance.java
new file mode 100644
index 0000000..800704d
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/producer/FlussSensorProducerAppMultiInstance.java
@@ -0,0 +1,545 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.benchmark.e2eplatformaws.producer;
+
+import org.apache.fluss.benchmark.e2eplatformaws.model.SensorDataMinimal;
+import org.apache.fluss.client.Connection;
+import org.apache.fluss.client.ConnectionFactory;
+import org.apache.fluss.client.admin.Admin;
+import org.apache.fluss.client.table.Table;
+import org.apache.fluss.client.table.writer.UpsertWriter;
+import org.apache.fluss.config.ConfigOptions;
+import org.apache.fluss.config.Configuration;
+import org.apache.fluss.config.MemorySize;
+import org.apache.fluss.metadata.DatabaseDescriptor;
+import org.apache.fluss.metadata.Schema;
+import org.apache.fluss.metadata.TableDescriptor;
+import org.apache.fluss.metadata.TablePath;
+import org.apache.fluss.row.GenericRow;
+import org.apache.fluss.types.DataTypes;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.Random;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.LongAdder;
+
+/**
+ * Multi-instance Fluss producer that spreads a fixed device population across several producer instances.
+ *
+ * <p>Features:
+ * - 100,000 devices in total, partitioned across the producer instances
+ * - Each instance handles a contiguous device ID range derived from instance-id and total-producers
+ * - Multiple writer threads write in parallel to maximize throughput
+ * - Each device has its own generator with independent state
+ * - Fluss hashes sensor_id into the table's buckets (48 by default, configurable via --buckets)
+ * - The target rate is split evenly across writer threads; each thread picks its devices at random
+ */
+public final class FlussSensorProducerAppMultiInstance {
+    private static final Logger LOG = LoggerFactory.getLogger(FlussSensorProducerAppMultiInstance.class);
+    
+    // Size of the global device ID space; main() partitions [0, TOTAL_DEVICES) across producer instances.
+    private static final int TOTAL_DEVICES = 100_000;
+
+    // Static initializer: runs when this class is initialized, i.e. before main() starts executing.
+    static {
+        System.setProperty("java.net.preferIPv4Stack", "true"); // restrict the JVM to IPv4 sockets
+        System.setProperty("java.net.preferIPv4Addresses", "true"); // NOTE(review): may not be a recognised JDK property (cf. preferIPv6Addresses) - confirm
+        System.setProperty("java.net.useSystemProxies", "false"); // do not pick up OS-level proxy settings
+    }
+
+    /**
+     * Entry point: parses the options, makes sure the target table exists and then runs the writer threads
+     * until the record-count or duration limit is reached or a shutdown is requested.
+     *
+     * @param args command-line options understood by {@code ProducerOptions.parse}
+     * @throws IllegalArgumentException if the instance or writer-thread configuration is invalid
+     * @throws Exception if the metrics server, the Fluss connection or the schema setup fails
+     */
+    public static void main(String[] args) throws Exception {
+        ProducerOptions options = ProducerOptions.parse(args);
+        LOG.info("Starting multi-instance Fluss producer with options: {}", options);
+
+        // Validate instance configuration up front so misconfiguration fails before any connection is opened
+        if (options.totalProducers <= 0) {
+            throw new IllegalArgumentException("total-producers must be > 0");
+        }
+        if (options.instanceId < 0 || options.instanceId >= options.totalProducers) {
+            throw new IllegalArgumentException(
+                    String.format("instance-id must be >= 0 and < total-producers (%d)", options.totalProducers));
+        }
+        if (options.numWriterThreads <= 0) {
+            throw new IllegalArgumentException("writer-threads must be > 0");
+        }
+
+        // Calculate device ID range for this instance
+        int devicesPerInstance = TOTAL_DEVICES / options.totalProducers;
+        int startDeviceId = options.instanceId * devicesPerInstance;
+        int endDeviceId = (options.instanceId == options.totalProducers - 1)
+                ? TOTAL_DEVICES  // Last instance gets any remainder
+                : (options.instanceId + 1) * devicesPerInstance;
+        int deviceCount = endDeviceId - startDeviceId;
+
+        LOG.info("Instance {} of {}: Handling devices {} to {} ({} devices)",
+                options.instanceId, options.totalProducers, startDeviceId, endDeviceId - 1, deviceCount);
+
+        Configuration flussConf = new Configuration();
+        flussConf.set(ConfigOptions.BOOTSTRAP_SERVERS, Collections.singletonList(options.bootstrap));
+        flussConf.set(ConfigOptions.CLIENT_WRITER_BUFFER_MEMORY_SIZE, options.writerBufferSize);
+        flussConf.set(ConfigOptions.CLIENT_WRITER_BATCH_SIZE, options.writerBatchSize);
+        // Batch timeout from the environment (default 50ms): "<n>" or "<n>ms" is milliseconds, "<n>s" is seconds
+        String batchTimeoutStr = getEnv("CLIENT_WRITER_BATCH_TIMEOUT", "50ms");
+        try {
+            long amount = Long.parseLong(batchTimeoutStr.replaceAll("[^0-9]", ""));
+            // Only a bare seconds suffix scales the value; every other form is read as milliseconds
+            long millis = batchTimeoutStr.matches("(?i)\\d+\\s*s") ? TimeUnit.SECONDS.toMillis(amount) : amount;
+            flussConf.set(ConfigOptions.CLIENT_WRITER_BATCH_TIMEOUT, Duration.ofMillis(millis));
+            LOG.info("Set CLIENT_WRITER_BATCH_TIMEOUT to {}ms", millis);
+        } catch (Exception e) {
+            LOG.warn("Failed to parse CLIENT_WRITER_BATCH_TIMEOUT '{}', using default 50ms", batchTimeoutStr);
+            flussConf.set(ConfigOptions.CLIENT_WRITER_BATCH_TIMEOUT, Duration.ofMillis(50));
+        }
+
+        // Start Prometheus metrics server
+        ProducerMetrics metrics = new ProducerMetrics(8080);
+        metrics.start();
+        LOG.info("Prometheus metrics server started on port 8080");
+
+        TablePath tablePath = TablePath.of(options.database, options.table);
+
+        try (Connection connection = ConnectionFactory.createConnection(flussConf)) {
+            ensureSchema(connection, tablePath, options.bucketCount);
+
+            try (Table table = connection.getTable(tablePath)) {
+                AtomicBoolean running = new AtomicBoolean(true);
+                Runtime.getRuntime()
+                        .addShutdownHook(new Thread(() -> shutdown(running), "fluss-producer-shutdown"));
+
+                // Create thread pool for parallel writing
+                int numThreads = options.numWriterThreads;
+                ExecutorService executor = Executors.newFixedThreadPool(numThreads);
+                CountDownLatch completionLatch = new CountDownLatch(numThreads);
+
+                // Shared counters for statistics
+                LongAdder totalSent = new LongAdder();
+                AtomicLong startNano = new AtomicLong(System.nanoTime());
+                AtomicLong lastStatsNano = new AtomicLong(startNano.get());
+
+                // Rate control: each thread targets totalRate / numThreads records per second (0 = unthrottled)
+                double ratePerThread = options.recordsPerSecond > 0
+                        ? (double) options.recordsPerSecond / numThreads
+                        : 0.0;
+                long nanosPerRecordPerThread = ratePerThread > 0
+                        ? (long) (TimeUnit.SECONDS.toNanos(1) / ratePerThread)
+                        : 0;
+
+                LOG.info("Target rate: {} rec/s total, {} rec/s per thread, {} writer threads",
+                        options.recordsPerSecond, String.format(Locale.ROOT, "%.2f", ratePerThread), numThreads);
+
+                // Divide devices among threads
+                int devicesPerThread = (deviceCount + numThreads - 1) / numThreads; // Ceiling division
+                // Start writer threads
+                for (int threadId = 0; threadId < numThreads; threadId++) {
+                    int threadStartDevice = startDeviceId + threadId * devicesPerThread;
+                    int threadEndDevice = Math.min(threadStartDevice + devicesPerThread, endDeviceId);
+                    if (threadStartDevice >= endDeviceId) {
+                        // No devices for this thread
+                        completionLatch.countDown();
+                        continue;
+                    }
+
+                    final int finalThreadId = threadId;
+                    final int finalThreadStartDevice = threadStartDevice;
+                    final int finalThreadEndDevice = threadEndDevice;
+                    executor.submit(() -> {
+                        try {
+                            UpsertWriter writer = table.newUpsert().createWriter();
+                            DeviceRangeGenerator generator =
+                                    new DeviceRangeGenerator(finalThreadStartDevice, finalThreadEndDevice);
+                            long threadSent = 0;
+                            long threadStartNano = System.nanoTime();
+                            long stopAtCount = options.totalRecords > 0 ? options.totalRecords : Long.MAX_VALUE;
+                            long stopAtTime = options.runDuration.isZero()
+                                    ? Long.MAX_VALUE
+                                    : System.nanoTime() + options.runDuration.toNanos();
+                            LOG.info("[Thread {}] Starting - will stop at {} total records", finalThreadId, stopAtCount);
+                            while (running.get() && totalSent.sum() < stopAtCount && System.nanoTime() < stopAtTime) {
+                                SensorDataMinimal record = generator.next();
+                                writeToFluss(writer, record);
+                                threadSent++;
+                                totalSent.increment();
+                                long currentTotal = totalSent.sum();
+                                metrics.recordWrite();
+                                // Per-record tracing stays at DEBUG: INFO here would log on the hot path at full rate
+                                if (currentTotal % 10 == 0 && LOG.isDebugEnabled()) {
+                                    LOG.debug("[Thread {}] Generated record {} (device_id={}, total={})",
+                                            finalThreadId, threadSent, record.getSensorId(), currentTotal);
+                                }
+
+                                // A non-positive flush/stats interval disables that periodic action (and avoids % 0)
+                                if (options.flushEvery > 0 && threadSent % options.flushEvery == 0) {
+                                    writer.flush();
+                                }
+
+                                // Check if we've reached the total count
+                                if (stopAtCount != Long.MAX_VALUE && currentTotal >= stopAtCount) {
+                                    LOG.info("[Thread {}] Reached target count of {} records, stopping", finalThreadId, stopAtCount);
+                                    break;
+                                }
+
+                                if (options.statsEvery > 0 && currentTotal % options.statsEvery == 0) {
+                                    long now = System.nanoTime();
+                                    double overallRate = ratePerSecond(currentTotal, now - startNano.get());
+                                    double windowRate = ratePerSecond(options.statsEvery, now - lastStatsNano.get());
+                                    LOG.info(
+                                            "[Thread {}] Produced {} records (thread: {}, overall ~{} rec/s, last window ~{} rec/s)",
+                                            finalThreadId,
+                                            currentTotal,
+                                            threadSent,
+                                            String.format(Locale.ROOT, "%.0f", overallRate),
+                                            String.format(Locale.ROOT, "%.0f", windowRate));
+                                    lastStatsNano.set(now);
+                                    metrics.updateStats(currentTotal);
+                                }
+
+                                // Rate limiting per thread
+                                if (nanosPerRecordPerThread > 0) {
+                                    long target = threadStartNano + threadSent * nanosPerRecordPerThread;
+                                    long wait = target - System.nanoTime();
+                                    if (wait > 0) {
+                                        TimeUnit.NANOSECONDS.sleep(wait);
+                                    }
+                                }
+                            }
+
+                            writer.flush();
+                            LOG.info("[Thread {}] Stopped after emitting {} records", finalThreadId, threadSent);
+                        } catch (Exception e) {
+                            LOG.error("[Thread {}] Error in writer thread", finalThreadId, e);
+                        } finally {
+                            completionLatch.countDown();
+                        }
+                    });
+                }
+
+                // Wait for all threads to complete
+                completionLatch.await();
+                executor.shutdown();
+                if (!executor.awaitTermination(30, TimeUnit.SECONDS)) {
+                    executor.shutdownNow();
+                }
+
+                LOG.info("Producer instance {} stopped after emitting {} records total",
+                        options.instanceId, totalSent.sum());
+            }
+        } finally {
+            metrics.stop();
+        }
+    }
+
+    private static void shutdown(AtomicBoolean running) { // called from the JVM shutdown hook registered in main
+        LOG.info("Shutdown requested");
+        running.set(false); // writer loops read this flag in their loop condition and stop iterating
+    }
+
+    private static void ensureSchema(Connection connection, TablePath tablePath, int bucketCount)
+            throws Exception {
+        try (Admin admin = connection.getAdmin()) {
+            admin.createDatabase(
+                            tablePath.getDatabaseName(),
+                            DatabaseDescriptor.builder().comment("IoT demo database").build(),
+                            true)
+                    .get();
+
+            // Schema matching AVRO schema from JDBCFlinkConsumer.java
+            // Only minimal fields from AVRO schema are stored in Fluss
+            Schema schema = Schema.newBuilder()
+                    .primaryKey("sensor_id")
+                    .column("sensor_id", DataTypes.INT())
+                    .column("sensor_type", DataTypes.INT())
+                    .column("temperature", DataTypes.DOUBLE())
+                    .column("humidity", DataTypes.DOUBLE())
+                    .column("pressure", DataTypes.DOUBLE())
+                    .column("battery_level", DataTypes.DOUBLE())
+                    .column("status", DataTypes.INT())
+                    .column("timestamp", DataTypes.BIGINT())
+                    .build();
+
+            TableDescriptor descriptor = TableDescriptor.builder()
+                    .schema(schema)
+                    .comment("Realtime sensor readings - matches AVRO schema from JDBCFlinkConsumer.java")
+                    .distributedBy(bucketCount, "sensor_id")
+                    .build();
+
+            admin.createTable(tablePath, descriptor, true).get();
+            LOG.info("Ensured Fluss table {} exists ({} buckets)", tablePath, bucketCount);
+        }
+    }
+
+    private static void writeToFluss(UpsertWriter writer, SensorDataMinimal data) throws Exception {
+        // Schema matching AVRO: sensor_id, sensor_type, temperature, humidity, pressure, 
+        //                      battery_level, status, timestamp
+        GenericRow row = new GenericRow(8);
+        row.setField(0, data.getSensorId());  // INT
+        row.setField(1, data.getSensorType());  // INT (1-8)
+        row.setField(2, data.getTemperature());
+        row.setField(3, data.getHumidity());
+        row.setField(4, data.getPressure());
+        row.setField(5, data.getBatteryLevel());
+        row.setField(6, data.getStatus()); // INT: 1=online, 2=offline, 3=maintenance, 4=error
+        row.setField(7, data.getTimestamp()); // LONG: timestamp in milliseconds
+
+        writer.upsert(row);
+    }
+
+    /**
+     * Immutable producer settings. Defaults come from hard-coded values and environment variables;
+     * command-line arguments, given as {@code --name value} or {@code --name=value}, override them.
+     */
+    private record ProducerOptions(
+            String bootstrap,
+            String database,
+            String table,
+            int bucketCount,
+            long totalRecords,
+            Duration runDuration,
+            int recordsPerSecond,
+            int flushEvery,
+            MemorySize writerBufferSize,
+            MemorySize writerBatchSize,
+            int statsEvery,
+            int totalProducers,
+            int instanceId,
+            int numWriterThreads) {
+
+        /**
+         * Builds the options from the defaults and the given arguments.
+         *
+         * @param args raw command-line arguments
+         * @return the parsed options
+         * @throws IllegalArgumentException on an unknown option, a missing value or an unparsable number
+         */
+        private static ProducerOptions parse(String[] args) {
+            // Defaults; the environment is consulted before any argument is read.
+            String bootstrap = "localhost:9124";
+            String database = "iot";
+            String table = "sensor_readings";
+            int bucketCount = 48;
+            long totalRecords = 0L;
+            Duration runDuration = Duration.ZERO;
+            int recordsPerSecond = getIntEnv("PRODUCER_RATE", 200000);
+            int flushEvery = getIntEnv("PRODUCER_FLUSH_EVERY", 200000);
+            int statsEvery = getIntEnv("PRODUCER_STATS_EVERY", 50_000);
+            MemorySize bufferSize = MemorySize.parse(getEnv("CLIENT_WRITER_BUFFER_MEMORY_SIZE", "2gb"));
+            MemorySize batchSize = MemorySize.parse(getEnv("CLIENT_WRITER_BATCH_SIZE", "128mb"));
+            int totalProducers = getIntEnv("TOTAL_PRODUCERS", 1);
+            int instanceId = getIntEnv("INSTANCE_ID", 0);
+            int numWriterThreads = getIntEnv("NUM_WRITER_THREADS", 8);
+
+            for (int i = 0; i < args.length; i++) {
+                // Split "--name=value" once; a token without '=' takes its value from the next argument.
+                String token = args[i];
+                int eq = token.indexOf('=');
+                String option = eq > 0 ? token.substring(0, eq) : token;
+                String inline = eq > 0 ? token.substring(eq + 1) : null;
+
+                switch (option) {
+                    // Connection and table
+                    case "--bootstrap" ->
+                            bootstrap = inline != null ? inline : requireValue(option, args, ++i);
+                    case "--database" ->
+                            database = inline != null ? inline : requireValue(option, args, ++i);
+                    case "--table" ->
+                            table = inline != null ? inline : requireValue(option, args, ++i);
+                    case "--buckets" ->
+                            bucketCount = Integer.parseInt(inline != null ? inline : requireValue(option, args, ++i));
+                    // Stop conditions, pacing and reporting
+                    case "--count" ->
+                            totalRecords = Long.parseLong(inline != null ? inline : requireValue(option, args, ++i));
+                    case "--duration" -> {
+                        String spec = inline != null ? inline : requireValue(option, args, ++i);
+                        runDuration = Duration.parse("PT" + spec.toUpperCase(Locale.ROOT));
+                    }
+                    case "--rate" ->
+                            recordsPerSecond = Integer.parseInt(inline != null ? inline : requireValue(option, args, ++i));
+                    case "--flush" ->
+                            flushEvery = Integer.parseInt(inline != null ? inline : requireValue(option, args, ++i));
+                    case "--stats" ->
+                            statsEvery = Integer.parseInt(inline != null ? inline : requireValue(option, args, ++i));
+                    // Instance partitioning and parallelism
+                    case "--total-producers" ->
+                            totalProducers = Integer.parseInt(inline != null ? inline : requireValue(option, args, ++i));
+                    case "--instance-id" ->
+                            instanceId = Integer.parseInt(inline != null ? inline : requireValue(option, args, ++i));
+                    case "--writer-threads" ->
+                            numWriterThreads = Integer.parseInt(inline != null ? inline : requireValue(option, args, ++i));
+                    default -> throw new IllegalArgumentException("Unknown argument: " + option);
+                }
+            }
+
+            return new ProducerOptions(
+                    bootstrap,
+                    database,
+                    table,
+                    bucketCount,
+                    totalRecords,
+                    runDuration,
+                    recordsPerSecond,
+                    flushEvery,
+                    bufferSize,
+                    batchSize,
+                    statsEvery,
+                    totalProducers,
+                    instanceId,
+                    numWriterThreads);
+        }
+
+        private static String requireValue(String option, String[] args, int index) {
+            if (index < args.length) {
+                return args[index];
+            }
+            throw new IllegalArgumentException("Missing value for " + option);
+        }
+    }
+
+    /** Reads an int from the environment; yields the default when the variable is unset, blank or not an int. */
+    private static int getIntEnv(String envVar, int defaultValue) {
+        String raw = System.getenv(envVar);
+        String trimmed = raw == null ? "" : raw.trim();
+        int result = defaultValue;
+        if (!trimmed.isEmpty()) {
+            try {
+                result = Integer.parseInt(trimmed);
+            } catch (NumberFormatException e) {
+                // The raw (untrimmed) value is logged so stray whitespace stays visible.
+                LOG.warn("Invalid value for environment variable {}: {}, using default: {}", envVar, raw, defaultValue);
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Returns the trimmed value of the environment variable, or the default when it is unset or blank.
+     */
+    private static String getEnv(String envVar, String defaultValue) {
+        String raw = System.getenv(envVar);
+        String trimmed = (raw == null) ? "" : raw.trim();
+        // An empty result covers both the unset and the whitespace-only case.
+        return trimmed.isEmpty()
+                ? defaultValue : trimmed;
+    }
+
+    private static double ratePerSecond(long records, long elapsedNanos) {
+        // A non-positive interval has no meaningful rate; report 0 instead of dividing.
+        boolean measurable = elapsedNanos > 0;
+        double elapsedSeconds = elapsedNanos / 1_000_000_000d;
+        return measurable ? records / elapsedSeconds : 0d;
+    }
+
+    /**
+     * Generator for a specific device ID range.
+     * Each device has its own independent state and generates data.
+     */
+    private static final class DeviceRangeGenerator {
+        private final Random random = new Random();
+        
+        // Per-device generators (one Random per device for independent state)
+        private final List<DeviceGenerator> deviceGenerators;
+
+        private DeviceRangeGenerator(int startDeviceId, int endDeviceId) {
+            // Create a generator for each device in the range
+            this.deviceGenerators = new ArrayList<>(endDeviceId - startDeviceId);
+            for (int deviceId = startDeviceId; deviceId < endDeviceId; deviceId++) {
+                deviceGenerators.add(new DeviceGenerator(deviceId));
+            }
+        }
+
+        /**
+         * Generate next sensor data record from a random device in this range.
+         * Device IDs are integers matching minimal schema.
+         * Fluss will hash the sensor_id to determine the bucket (0-47).
+         */
+        private SensorDataMinimal next() {
+            DeviceGenerator deviceGen = deviceGenerators.get(random.nextInt(deviceGenerators.size()));
+            return deviceGen.next();
+        }
+    }
+
+    /**
+     * Produces readings for one device.
+     * The random source is seeded with the device ID, so a device yields the same sequence of random
+     * values on every run while remaining independent of all other devices.
+     * Only the fields of the minimal sensor schema are populated.
+     */
+    private static final class DeviceGenerator {
+        // Sensor type codes:
+        // 1=temperature, 2=humidity, 3=pressure, 4=motion, 5=light, 6=co2, 7=noise, 8=multisensor
+        private static final int[] SENSOR_TYPES = {1, 2, 3, 4, 5, 6, 7, 8};
+
+        private final int deviceId;
+        private final Random random;
+
+        private DeviceGenerator(int deviceId) {
+            this.deviceId = deviceId;
+            // Seeding with the ID makes the per-device value stream reproducible.
+            this.random = new Random(deviceId);
+        }
+
+        /**
+         * Builds the next reading for this device.
+         * The draws from the random source happen in a fixed order: type, temperature, humidity,
+         * pressure, battery level, status.
+         */
+        private SensorDataMinimal next() {
+            SensorDataMinimal reading = new SensorDataMinimal();
+
+            // Identity and sensor type (1-8)
+            reading.setSensorId(deviceId);
+            int typeIndex = random.nextInt(SENSOR_TYPES.length);
+            reading.setSensorType(SENSOR_TYPES[typeIndex]);
+
+            // Environmental readings: base offset plus a uniform draw scaled to the range width
+            reading.setTemperature(10 + random.nextDouble() * 30);  // 10-40°C
+            reading.setHumidity(20 + random.nextDouble() * 60);     // 20-80%
+            reading.setPressure(980 + random.nextDouble() * 50);    // 980-1030 hPa
+
+            // Battery charge, 20-100%
+            reading.setBatteryLevel(20 + random.nextDouble() * 80);
+
+            // Status: 85% online (1), then 5% each for offline (2), maintenance (3) and error (4)
+            int roll = random.nextInt(100);
+            int status = roll < 85 ? 1
+                    : roll < 90 ? 2
+                    : roll < 95 ? 3
+                    : 4;
+            reading.setStatus(status);
+
+            // Wall-clock timestamp in milliseconds since the epoch
+            reading.setTimestamp(System.currentTimeMillis());
+            return reading;
+        }
+    }
+}
+
diff --git a/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/producer/ProducerMetrics.java b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/producer/ProducerMetrics.java
new file mode 100644
index 0000000..d02509e
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/producer/ProducerMetrics.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.benchmark.e2eplatformaws.producer;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.net.InetSocketAddress;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.LongAdder;
+
+import com.sun.net.httpserver.HttpServer;
+
+/**
+ * Minimal Prometheus metrics endpoint for the producer.
+ * Serves the text exposition format at /metrics on the port given to the constructor.
+ */
+public class ProducerMetrics {
+    private static final Logger LOG = LoggerFactory.getLogger(ProducerMetrics.class);
+
+    private final LongAdder totalRecords = new LongAdder();
+    private final AtomicLong startTime = new AtomicLong(System.currentTimeMillis());
+    private final AtomicLong lastStatsTime = new AtomicLong(System.currentTimeMillis());
+    private final AtomicLong lastStatsRecords = new AtomicLong(0);
+    private volatile double lastWindowRate = 0.0; // rec/s between the two latest accepted updateStats() calls
+    private HttpServer server;
+    private final int port;
+
+    public ProducerMetrics(int port) { this.port = port; }
+
+    /** Starts the HTTP server; fails with IOException if it cannot be created on the port. */
+    public void start() throws IOException {
+        // Bind to 0.0.0.0 to make it accessible from outside the container
+        server = HttpServer.create(new InetSocketAddress("0.0.0.0", port), 0);
+        server.createContext("/metrics", this::handleMetrics);
+        server.setExecutor(null); // Use default executor
+        server.start();
+        LOG.info("Producer metrics server started on port {} (bound to 0.0.0.0)", port);
+    }
+
+    /** Stops the HTTP server without delay; does nothing if start() never created one. */
+    public void stop() {
+        if (server != null) {
+            server.stop(0);
+            LOG.info("Producer metrics server stopped");
+        }
+    }
+
+    /** Counts one written record. */
+    public void recordWrite() { totalRecords.increment(); }
+
+    /** Closes a statistics window at the given cumulative record count; safe to call from several threads. */
+    public synchronized void updateStats(long records) {
+        long previous = lastStatsRecords.get();
+        if (records <= previous) {
+            return; // stale or duplicate report from another writer thread
+        }
+        long now = System.currentTimeMillis();
+        long windowMillis = now - lastStatsTime.get();
+        lastWindowRate = windowMillis > 0 ? (records - previous) * 1000.0 / windowMillis : lastWindowRate;
+        lastStatsRecords.set(records);
+        lastStatsTime.set(now);
+    }
+
+    /** Renders all metrics in the Prometheus text format and sends them as the HTTP response. */
+    private void handleMetrics(com.sun.net.httpserver.HttpExchange exchange) throws IOException {
+        long total = totalRecords.sum();
+        long elapsedMillis = System.currentTimeMillis() - startTime.get();
+        double overallRate = elapsedMillis > 0 ? total * 1000.0 / elapsedMillis : 0.0;
+        java.util.Locale root = java.util.Locale.ROOT; // keeps '.' as the decimal separator in every locale
+        StringBuilder response = new StringBuilder();
+        response.append("# HELP fluss_producer_records_total Total number of records written to Fluss\n");
+        response.append("# TYPE fluss_producer_records_total counter\n");
+        response.append("fluss_producer_records_total ").append(total).append("\n");
+        response.append("# HELP fluss_producer_records_per_second Overall records per second\n");
+        response.append("# TYPE fluss_producer_records_per_second gauge\n");
+        response.append("fluss_producer_records_per_second ").append(String.format(root, "%.2f", overallRate)).append("\n");
+        response.append("# HELP fluss_producer_records_per_second_window Records per second in last window\n");
+        response.append("# TYPE fluss_producer_records_per_second_window gauge\n");
+        response.append("fluss_producer_records_per_second_window ").append(String.format(root, "%.2f", lastWindowRate)).append("\n");
+        response.append("# HELP fluss_producer_uptime_seconds Producer uptime in seconds\n");
+        response.append("# TYPE fluss_producer_uptime_seconds gauge\n");
+        response.append("fluss_producer_uptime_seconds ").append(elapsedMillis / 1000).append("\n");
+        byte[] payload = response.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8);
+        exchange.getResponseHeaders().set("Content-Type", "text/plain; version=0.0.4; charset=utf-8");
+        exchange.sendResponseHeaders(200, payload.length); // length in encoded bytes, not characters
+        try (OutputStream os = exchange.getResponseBody()) {
+            os.write(payload);
+        }
+    }
+}
+
diff --git a/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/setup/CreateTableWithBuckets.java b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/setup/CreateTableWithBuckets.java
new file mode 100644
index 0000000..bd9bebf
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/src/main/java/org/apache/fluss/benchmark/e2eplatformaws/setup/CreateTableWithBuckets.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.benchmark.e2eplatformaws.setup;
+
+import org.apache.fluss.client.Connection;
+import org.apache.fluss.client.ConnectionFactory;
+import org.apache.fluss.client.admin.Admin;
+import org.apache.fluss.config.ConfigOptions;
+import org.apache.fluss.config.Configuration;
+import org.apache.fluss.metadata.DatabaseDescriptor;
+import org.apache.fluss.metadata.Schema;
+import org.apache.fluss.metadata.TableDescriptor;
+import org.apache.fluss.metadata.TablePath;
+import org.apache.fluss.types.DataTypes;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Collections;
+
+/**
+ * Utility to create a Fluss table with a specific number of buckets.
+ * This can be run before the producer starts to ensure the table exists with the correct bucket count.
+ *
+ * <p>Exits with status 1 on too few arguments or when an existing table has a different bucket count.
+ */
+public final class CreateTableWithBuckets {
+    private static final Logger LOG = LoggerFactory.getLogger(CreateTableWithBuckets.class);
+
+    private CreateTableWithBuckets() {}
+
+    /** Parses the command-line arguments, then creates the table or verifies its bucket count. */
+    public static void main(String[] args) throws Exception {
+        if (args.length < 4) {
+            System.err.println("Usage: CreateTableWithBuckets <bootstrap> <database> <table> <buckets> [drop-if-exists]");
+            System.err.println("  bootstrap: Fluss coordinator address (e.g., localhost:9124)");
+            System.err.println("  database: Database name");
+            System.err.println("  table: Table name");
+            System.err.println("  buckets: Number of buckets (e.g., 48)");
+            System.err.println("  drop-if-exists: Optional, set to 'true' to drop table if it exists");
+            System.exit(1);
+        }
+
+        String bootstrap = args[0];
+        String database = args[1];
+        String table = args[2];
+        int buckets = Integer.parseInt(args[3]);
+        boolean dropIfExists = args.length > 4 && "true".equalsIgnoreCase(args[4]);
+
+        Configuration conf = new Configuration();
+        conf.set(ConfigOptions.BOOTSTRAP_SERVERS, Collections.singletonList(bootstrap));
+
+        TablePath tablePath = new TablePath(database, table);
+
+        // Deferred exit flag: calling System.exit inside try-with-resources would skip close().
+        boolean bucketMismatch = false;
+        try (Connection connection = ConnectionFactory.createConnection(conf);
+                Admin admin = connection.getAdmin()) {
+
+            // Create database if it doesn't exist
+            LOG.info("Creating database '{}' if it doesn't exist...", database);
+            admin.createDatabase(
+                            database,
+                            DatabaseDescriptor.builder().comment("IoT demo database").build(),
+                            true)
+                    .get();
+            LOG.info("Database '{}' ready", database);
+
+            // Explicit existence check: RPC failures propagate instead of reading as "table missing".
+            boolean tableExists = admin.tableExists(tablePath).get();
+            if (tableExists) {
+                LOG.info("Table '{}' already exists", tablePath);
+            } else {
+                LOG.info("Table '{}' does not exist, will create it", tablePath);
+            }
+
+            // Drop table if it exists and dropIfExists is true
+            if (tableExists && dropIfExists) {
+                LOG.info("Dropping existing table '{}'...", tablePath);
+                admin.dropTable(tablePath, true).get();
+                LOG.info("Table '{}' dropped", tablePath);
+                tableExists = false;
+            }
+
+            // Create table if it doesn't exist
+            if (!tableExists) {
+                LOG.info("Creating table '{}' with {} buckets...", tablePath, buckets);
+
+                // Schema matching AVRO schema from JDBCFlinkConsumer.java
+                // Only minimal fields from AVRO schema are stored in Fluss
+                // Remaining fields will be set to default values at the sink (matching JDBCFlinkConsumer.java)
+                Schema schema = Schema.newBuilder()
+                        .primaryKey("sensor_id")  // Primary key (maps from sensorId in AVRO)
+                        .column("sensor_id", DataTypes.INT())  // sensorId from AVRO
+                        .column("sensor_type", DataTypes.INT())  // sensorType from AVRO (1-8)
+                        .column("temperature", DataTypes.DOUBLE())
+                        .column("humidity", DataTypes.DOUBLE())
+                        .column("pressure", DataTypes.DOUBLE())
+                        .column("battery_level", DataTypes.DOUBLE())
+                        .column("status", DataTypes.INT())  // 1=online, 2=offline, 3=maintenance, 4=error
+                        .column("timestamp", DataTypes.BIGINT())  // timestamp-millis from AVRO
+                        .build();
+
+                TableDescriptor descriptor = TableDescriptor.builder()
+                        .schema(schema)
+                        .comment("Realtime sensor readings - matches AVRO schema from JDBCFlinkConsumer.java")
+                        .distributedBy(buckets, "sensor_id")
+                        .build();
+
+                admin.createTable(tablePath, descriptor, false).get();
+                LOG.info("Successfully created table '{}' with {} buckets", tablePath, buckets);
+            } else {
+                // Verify bucket count; a lookup failure propagates out of main (non-zero exit)
+                int actualBuckets = admin.getTableInfo(tablePath).get().getNumBuckets();
+                LOG.info("Table '{}' already exists with {} buckets", tablePath, actualBuckets);
+                if (actualBuckets != buckets) {
+                    LOG.warn("WARNING: Table has {} buckets, but requested {} buckets. " +
+                            "Set drop-if-exists=true to recreate the table.", actualBuckets, buckets);
+                    bucketMismatch = true;
+                } else {
+                    LOG.info("Table '{}' has the correct number of buckets ({})", tablePath, buckets);
+                }
+            }
+        }
+
+        if (bucketMismatch) {
+            System.exit(1);
+        }
+        LOG.info("Done!");
+    }
+}
+
diff --git a/e2e-iot/fluss_flink_realtime/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory b/e2e-iot/fluss_flink_realtime/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory
new file mode 100644
index 0000000..9ba0f01
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory
@@ -0,0 +1 @@
+org.apache.fluss.flink.catalog.FlinkCatalogFactory
diff --git a/e2e-iot/fluss_flink_realtime/test-local.sh b/e2e-iot/fluss_flink_realtime/test-local.sh
new file mode 100755
index 0000000..b87e3b5
--- /dev/null
+++ b/e2e-iot/fluss_flink_realtime/test-local.sh
@@ -0,0 +1,219 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Local test script for Fluss producer and Flink job
+# Tests the minimal schema with default values added at sink
+# The shebang must stay on line 1: the script relies on bash (BASH_SOURCE, brace expansion).
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
+cd "${PROJECT_ROOT}"
+
+# Colors for output
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+echo -e "${GREEN}=== Local Fluss + Flink Test ===${NC}"
+echo ""
+
+# Check prerequisites
+echo -e "${YELLOW}[1/7] Checking prerequisites...${NC}"
+if ! command -v mvn &> /dev/null; then
+    echo -e "${RED}ERROR: Maven not found. Please install Maven.${NC}"
+    exit 1
+fi
+
+if [ ! -d "demos/demo/deploy_local_kind_fluss/fluss-0.8.0-incubating" ]; then
+    echo -e "${RED}ERROR: Fluss 0.8.0 not found at demos/demo/deploy_local_kind_fluss/fluss-0.8.0-incubating${NC}"
+    echo "Please extract fluss-0.8.0-incubating.tgz to that location"
+    exit 1
+fi
+
+FLUSS_DIR="${PROJECT_ROOT}/demos/demo/deploy_local_kind_fluss/fluss-0.8.0-incubating"
+JAR_PATH="${SCRIPT_DIR}/target/fluss-flink-realtime-demo.jar"
+
+# Build the JAR
+echo -e "${YELLOW}[2/7] Building demo JAR...${NC}"
+mvn -pl demos/demo/fluss_flink_realtime_demo -am clean package -DskipTests
+if [ ! -f "${JAR_PATH}" ]; then
+    echo -e "${RED}ERROR: JAR not found at ${JAR_PATH}${NC}"
+    exit 1
+fi
+echo -e "${GREEN}✓ JAR built successfully${NC}"
+echo ""
+
+# Start Fluss local cluster
+echo -e "${YELLOW}[3/7] Starting Fluss local cluster...${NC}"
+if [ -f "${FLUSS_DIR}/bin/local-cluster.sh" ]; then
+    # Check if already running
+    if pgrep -f "fluss.*coordinator" > /dev/null; then
+        echo -e "${YELLOW}Fluss cluster appears to be running. Skipping start.${NC}"
+    else
+        "${FLUSS_DIR}/bin/local-cluster.sh" start
+        echo "Waiting for Fluss to be ready..."
+        sleep 10
+        # Wait for coordinator to be ready
+        for i in {1..30}; do
+            if nc -z localhost 9123 2>/dev/null; then
+                echo -e "${GREEN}✓ Fluss coordinator is ready${NC}"
+                break
+            fi
+            if [ $i -eq 30 ]; then
+                echo -e "${RED}ERROR: Fluss coordinator not ready after 30 seconds${NC}"
+                exit 1
+            fi
+            sleep 1
+        done
+    fi
+else
+    echo -e "${RED}ERROR: Fluss local-cluster.sh not found${NC}"
+    exit 1
+fi
+echo ""
+
+# Create table with 48 buckets
+# The command is the 'if' condition itself: under 'set -e' a separate '$?' check would
+# never see a failure, because the script would already have exited.
+echo -e "${YELLOW}[4/7] Creating Fluss table with 48 buckets...${NC}"
+if java --add-opens=java.base/java.util=ALL-UNNAMED \
+     --add-opens=java.base/java.lang=ALL-UNNAMED \
+     --add-opens=java.base/java.nio=ALL-UNNAMED \
+     --add-opens=java.base/java.time=ALL-UNNAMED \
+     -cp "${JAR_PATH}" \
+     org.apache.fluss.benchmark.e2eplatformaws.setup.CreateTableWithBuckets \
+     localhost:9123 iot sensor_readings 48 true; then
+    echo -e "${GREEN}✓ Table created successfully${NC}"
+else
+    echo -e "${RED}ERROR: Failed to create table${NC}"
+    exit 1
+fi
+echo ""
+
+# Verify table exists
+echo -e "${YELLOW}[5/7] Verifying table exists...${NC}"
+java --add-opens=java.base/java.util=ALL-UNNAMED \
+     --add-opens=java.base/java.lang=ALL-UNNAMED \
+     --add-opens=java.base/java.nio=ALL-UNNAMED \
+     --add-opens=java.base/java.time=ALL-UNNAMED \
+     -cp "${JAR_PATH}" \
+     org.apache.fluss.benchmark.e2eplatformaws.inspect.FlussMetadataInspector localhost:9123 iot 2>/dev/null | grep -q "sensor_readings" && \
+     echo -e "${GREEN}✓ Table verified${NC}" || echo -e "${YELLOW}Warning: Could not verify table${NC}"
+echo ""
+
+# Start producer (instance 0 of 1) as a background job and remember its PID
+echo -e "${YELLOW}[6/7] Starting producer (instance 0 of 1, 100K devices)...${NC}"
+echo "Producer will run in background. Press Ctrl+C to stop."
+echo ""
+
+# Producer application arguments (same values and order as a plain command line)
+PRODUCER_ARGS=(
+    --bootstrap localhost:9123 --database iot --table sensor_readings
+    --buckets 48 --total-producers 1 --instance-id 0
+    --rate 10000 --writer-threads 4
+    --flush 10000 --stats 50000
+)
+
+java --add-opens=java.base/java.util=ALL-UNNAMED \
+     --add-opens=java.base/java.lang=ALL-UNNAMED \
+     --add-opens=java.base/java.nio=ALL-UNNAMED \
+     --add-opens=java.base/java.time=ALL-UNNAMED \
+     -cp "${JAR_PATH}" \
+     org.apache.fluss.benchmark.e2eplatformaws.producer.FlussSensorProducerAppMultiInstance \
+     "${PRODUCER_ARGS[@]}" &
+PRODUCER_PID=$!
+
+echo "Producer PID: ${PRODUCER_PID}"
+echo "Waiting 10 seconds for producer to start generating data..."
+sleep 10
+echo ""
+
+# Check if Flink is available
+FLINK_DIR="${PROJECT_ROOT}/flink-1.20.3"
+if [ ! -d "${FLINK_DIR}" ]; then
+    echo -e "${YELLOW}[7/7] Flink not found at ${FLINK_DIR}${NC}"
+    echo "Skipping Flink job. You can run it manually:"
+    echo ""
+    echo "  ${FLINK_DIR}/bin/flink run \\"
+    echo "    -c org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob \\"
+    echo "    ${JAR_PATH} \\"
+    echo "    --bootstrap localhost:9123 --database iot --table sensor_readings --window-minutes 1"
+    echo ""
+    echo -e "${GREEN}✓ Producer is running (PID: ${PRODUCER_PID})${NC}"
+    echo "To stop the producer: kill ${PRODUCER_PID}"
+    echo "To stop Fluss: ${FLUSS_DIR}/bin/local-cluster.sh stop"
+    exit 0
+fi
+
+# Start Flink job
+echo -e "${YELLOW}[7/7] Starting Flink aggregation job...${NC}"
+echo "Flink job will read from Fluss and add default values at sink level"
+echo ""
+
+# Check if Flink cluster is running
+if ! nc -z localhost 8081 2>/dev/null; then
+    echo -e "${YELLOW}Starting Flink cluster...${NC}"
+    "${FLINK_DIR}/bin/start-cluster.sh"
+    sleep 5
+fi
+
+# Submit Flink job. 'run' is a separate word (a quoted "flink run" would be looked up as one
+# executable name); '|| FLINK_EXIT_CODE=$?' records a failure without tripping 'set -e'.
+FLINK_EXIT_CODE=0
+"${FLINK_DIR}/bin/flink" run \
+    -c org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob \
+    "${JAR_PATH}" \
+    --bootstrap localhost:9123 \
+    --database iot \
+    --table sensor_readings \
+    --window-minutes 1 || FLINK_EXIT_CODE=$?
+
+echo ""
+echo -e "${GREEN}=== Test Summary ===${NC}"
+echo "Producer PID: ${PRODUCER_PID}"
+echo "Flink Job Exit Code: ${FLINK_EXIT_CODE}"
+echo ""
+echo "To stop producer: kill ${PRODUCER_PID}"
+echo "To stop Fluss: ${FLUSS_DIR}/bin/local-cluster.sh stop"
+echo "To stop Flink: ${FLINK_DIR}/bin/stop-cluster.sh"
+echo ""
+echo "To view Flink UI: http://localhost:8081"
+echo "To check producer logs: Check console output above"
+echo ""
+
+# Cleanup handler: stops the background producer; Fluss and Flink are left running
+cleanup() {
+    trap - EXIT INT TERM # run once: a trapped signal is otherwise followed by the EXIT trap
+    echo ""
+    echo -e "${YELLOW}Cleaning up...${NC}"
+    if kill -0 "${PRODUCER_PID}" 2>/dev/null; then
+        echo "Stopping producer..."
+        kill "${PRODUCER_PID}" 2>/dev/null || true
+    fi
+    echo "Done. Fluss and Flink clusters are still running."
+    echo "Stop them manually if needed."
+}
+
+trap cleanup EXIT INT TERM
+# Wait for user interrupt
+echo -e "${GREEN}Test is running. Press Ctrl+C to stop producer and exit.${NC}"
+wait "${PRODUCER_PID}"
+
diff --git a/e2e-iot/high-infra/.gitignore b/e2e-iot/high-infra/.gitignore
new file mode 100644
index 0000000..041d7c6
--- /dev/null
+++ b/e2e-iot/high-infra/.gitignore
@@ -0,0 +1,17 @@
+# Terraform
+*.tfstate
+*.tfstate.*
+.terraform/
+.terraform.lock.hcl
+# terraform.tfvars - commented out to allow tracking mandatory terraform.tfvars
+# *.tfvars.backup
+
+# Helm charts (downloaded)
+# helm-charts/fluss/ - commented out to allow tracking mandatory helm charts
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+
diff --git a/e2e-iot/high-infra/DEPLOY-STEPS.md b/e2e-iot/high-infra/DEPLOY-STEPS.md
new file mode 100644
index 0000000..9926923
--- /dev/null
+++ b/e2e-iot/high-infra/DEPLOY-STEPS.md
@@ -0,0 +1,459 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Deployment Steps Guide
+
+This document describes the step-by-step deployment process for the Fluss high-infrastructure setup, starting from after Terraform infrastructure deployment.
+
+## Prerequisites
+
+- Terraform infrastructure deployed (EKS cluster, node groups, ECR repositories)
+- `kubectl` configured and connected to the EKS cluster
+- `helm` installed
+- AWS CLI configured with appropriate permissions
+- ECR images pushed (fluss, fluss-demo)
+
+## Step 1: Update Kubeconfig
+
+After Terraform deployment, update your kubeconfig to connect to the EKS cluster:
+
+```bash
+cd aws-deploy-fluss/high-infra/terraform
+aws eks update-kubeconfig --region us-west-2 --name fluss-eks-cluster
+
+# Verify connection
+kubectl cluster-info
+```
+
+## Step 2: Setup Local NVMe Storage (for Tablet Servers)
+
+Configure local NVMe storage for Fluss tablet servers:
+
+```bash
+cd aws-deploy-fluss/high-infra/k8s/storage
+./setup-local-storage.sh
+```
+
+This script:
+- Creates a `local-storage` StorageClass
+- Creates PersistentVolumes for tablet servers (3 PVs, 500Gi each)
+- Configures node affinity to tablet-server nodes
+
+**Verify:**
+```bash
+kubectl get storageclass local-storage
+kubectl get pv -l component=tablet-server
+```
+
+## Step 3: Deploy All Components
+
+Deploy ZooKeeper, Fluss, Flink, and Monitoring stack:
+
+```bash
+cd aws-deploy-fluss/high-infra/k8s
+
+# Deploy with ECR images
+./deploy.sh fluss \
+  343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss-demo \
+  latest \
+  343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss
+```
+
+**What gets deployed:**
+1. **Namespace**: Creates `fluss` namespace
+2. **ZooKeeper**: StatefulSet with 1 replica
+3. **Fluss**: 
+   - Coordinator (1 replica)
+   - Tablet Servers (3 replicas)
+   - Uses ECR image: `343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss:latest`
+4. **Flink**:
+   - JobManager (1 replica)
+   - TaskManager (2 replicas)
+   - Uses ECR image: `343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss-demo:latest`
+   - JAR embedded at: `/opt/flink/usrlib/fluss-flink-realtime-demo.jar`
+5. **Monitoring**:
+   - Prometheus (kube-prometheus-stack)
+   - Grafana
+   - ServiceMonitors and PodMonitors for metrics scraping
+6. **Grafana Dashboard**: ConfigMap with Fluss & Flink dashboard
+
+**Wait for components to be ready:**
+```bash
+# Check Fluss pods
+kubectl get pods -n fluss
+
+# Check monitoring pods
+kubectl get pods -n monitoring
+
+# Wait for all pods to be Running
+kubectl wait --for=condition=ready pod -l app=zookeeper -n fluss --timeout=120s
+kubectl wait --for=condition=ready pod -l app.kubernetes.io/component=coordinator -n fluss --timeout=300s
+kubectl wait --for=condition=ready pod -l app=flink,component=jobmanager -n fluss --timeout=120s
+```
+
+## Step 4: Deploy Multi-Instance Producer Job
+
+Deploy 8 producer instances (2 per node across 4 producer nodes) with 128 buckets:
+
+### Option 1: Use Multi-Instance Script (Recommended)
+
+```bash
+cd aws-deploy-fluss/high-infra/k8s/jobs
+
+# Deploy 8 producer instances with 128 buckets
+export BUCKETS=128
+./deploy-producer-multi-instance.sh --wait
+```
+
+**Multi-Instance Configuration (defaults):**
+- **Total Instances**: 8 (instance IDs 0-7)
+- **Distribution**: 2 pods per producer node (4 nodes total)
+- **Rate per instance**: 250,000 records/second
+- **Total rate**: 2,000,000 records/second
+- **Flush**: Every 5,000 records (optimal for throughput)
+- **Batch Timeout**: 90ms (optimal for batching)
+- **Buffer Size**: 2gb
+- **Batch Size**: 128mb
+- **Memory**: 4Gi request, 16Gi limit per instance
+- **CPU**: 2000m request, 8000m limit per instance
+- **Writer Threads**: 48 per instance
+- **Buckets**: 128 (must match table bucket count)
+
+**Customize if needed:**
+```bash
+./deploy-producer-optimal.sh \
+  --rate 200000 \
+  --flush 5000 \
+  --batch-timeout 90ms
+```
+
+### Option 2: Use Multi-Instance Script with Custom Parameters
+
+```bash
+cd aws-deploy-fluss/high-infra/k8s/jobs
+
+# Deploy 8 producer instances with custom parameters
+export BUCKETS=128
+export PRODUCER_RATE=250000
+export TOTAL_PRODUCERS=8
+./deploy-producer-multi-instance.sh --wait
+```
+
+**Configuration:**
+- **Image**: `343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss-demo:latest`
+- **JAR Path**: `/opt/flink/usrlib/fluss-flink-realtime-demo.jar`
+- **Rate per instance**: 250,000 records/second (default, customizable)
+- **Total rate**: 2,000,000 records/second (8 instances × 250K)
+- **Flush**: Every 5,000 records (optimal for throughput)
+- **Batch Timeout**: 90ms (optimal for batching)
+- **Buckets**: 128 (must match table bucket count)
+- **Nodes**: Runs on `producer` node group (c5.2xlarge), 2 pods per node
+
+**Note**: See `k8s/jobs/PRODUCER_CONFIG.md` for detailed configuration options and performance tuning guidelines.
+
+**Monitor producers:**
+```bash
+# Check producer pod status (should see 8 pods, 2 per node)
+kubectl get pods -n fluss -l app=fluss-producer -o wide
+
+# View producer logs (all instances)
+kubectl logs -n fluss -l app=fluss-producer -f
+
+# View logs for specific instance
+kubectl logs -n fluss -l app=fluss-producer,job-name=fluss-producer-0 -f
+
+# Check pod distribution across nodes
+kubectl get pods -n fluss -l app=fluss-producer -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.nodeName}{"\n"}{end}' | sort -k2
+
+# Check producer metrics
+kubectl port-forward -n fluss svc/fluss-producer-metrics 8080:8080
+# Then visit: http://localhost:8080/metrics
+```
+
+## Step 5: Submit Flink Aggregator Job
+
+Submit the Flink job that processes sensor data:
+
+```bash
+cd aws-deploy-fluss/high-infra/k8s/flink
+./submit-job-from-image.sh
+```
+
+**What this script does:**
+1. Cancels any existing Flink jobs
+2. Verifies JAR exists in Flink image at `/opt/flink/usrlib/fluss-flink-realtime-demo.jar`
+3. Uploads JAR from the image's local filesystem to Flink cluster
+4. Submits job via REST API with:
+   - Entry class: `org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob`
+   - Parallelism: 32
+   - Window: 1 minute
+   - **Scan mode**: `latest` (reads from latest position, not from beginning)
+
+**Note**: The JAR is embedded in the Flink image (`343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss-demo:latest`), so no local file upload is needed. The script uses the JAR from the ECR image.
+
+**Monitor Flink job:**
+```bash
+# Check Flink pods
+kubectl get pods -n fluss -l app=flink
+
+# Access Flink Web UI
+kubectl port-forward -n fluss svc/flink-jobmanager 8081:8081
+# Then visit: http://localhost:8081
+
+# View Flink job logs
+kubectl logs -n fluss -l app=flink,component=taskmanager -f
+```
+
+## Step 6: Deploy Grafana Dashboard
+
+Deploy the Grafana dashboard for monitoring:
+
+```bash
+cd aws-deploy-fluss/high-infra/k8s/monitoring
+./deploy-dashboard.sh
+```
+
+**Access Grafana:**
+```bash
+# Port-forward Grafana
+kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80
+
+# Then visit: http://localhost:3000
+# Username: admin
+# Password: admin123
+```
+
+**Dashboard includes:**
+- Producer metrics (records/sec, latency)
+- Flink metrics (input/output rates, aggregator records)
+- Fluss Coordinator metrics (cluster status, table/bucket counts)
+- Fluss Tablet Server metrics (messages rate, bytes rate, replication)
+- JVM metrics (CPU, memory for coordinator and tablet servers)
+
+## Step 7: Verify Deployment
+
+### Check All Pods
+
+```bash
+# Fluss namespace
+kubectl get pods -n fluss
+
+# Expected:
+# - zk-0: Running
+# - coordinator-server-0: Running
+# - tablet-server-0, tablet-server-1, tablet-server-2: Running
+# - flink-jobmanager-*: Running
+# - flink-taskmanager-0, flink-taskmanager-1: Running
+# - fluss-producer-*: Running
+
+# Monitoring namespace
+kubectl get pods -n monitoring
+```
+
+### Check Node Placement
+
+```bash
+# Verify nodes are on correct instance types
+kubectl get nodes -l node-type=coordinator
+kubectl get nodes -l node-type=tablet-server
+kubectl get nodes -l node-type=flink-jobmanager
+kubectl get nodes -l node-type=flink-taskmanager
+kubectl get nodes -l node-type=producer
+
+# Check pod placement
+kubectl get pods -n fluss -o wide
+```
+
+### Verify Metrics Scraping
+
+```bash
+# Check ServiceMonitors
+kubectl get servicemonitor -n fluss
+
+# Check PodMonitors
+kubectl get podmonitor -n fluss
+
+# Port-forward Prometheus and check targets
+kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090
+# Visit: http://localhost:9090/targets
+# All targets should be "UP"
+```
+
+### Verify Data Flow
+
+```bash
+# Check producer is writing data
+kubectl logs -n fluss -l app=fluss-producer --tail=50 | grep -i "records\|throughput"
+
+# Check Flink is processing data
+kubectl logs -n fluss -l app=flink,component=taskmanager --tail=50 | grep -i "aggregate\|records"
+
+# Check Fluss tablet servers are receiving data
+kubectl logs -n fluss -l app.kubernetes.io/component=tablet-server --tail=20
+```
+
+## Troubleshooting
+
+### Producer Not Starting
+
+**Issue**: Producer pod fails with "Unable to access jarfile"
+
+**Solution**: Ensure JAR path is correct in `producer-job.yaml`:
+- Correct path: `/opt/flink/usrlib/fluss-flink-realtime-demo.jar`
+- Image must be built with JAR embedded
+
+### Flink Job Submission Fails
+
+**Issue**: Cannot upload JAR or submit job
+
+**Solution**: 
+- Ensure Flink JobManager pod is Running
+- Check Flink REST API is accessible: `kubectl port-forward -n fluss svc/flink-jobmanager 8081:8081`
+- Verify JAR exists in Flink image:
+  ```bash
+  kubectl exec -n fluss <jobmanager-pod> -- test -f /opt/flink/usrlib/fluss-flink-realtime-demo.jar
+  ```
+- If JAR is missing, rebuild and push the image:
+  ```bash
+  cd aws-deploy-fluss/high-infra/k8s/flink
+  ./build-and-push.sh
+  ```
+- Then restart Flink pods to pull the new image
+
+### Metrics Not Appearing
+
+**Issue**: No metrics in Grafana/Prometheus
+
+**Solution**:
+1. Check ServiceMonitors/PodMonitors are deployed:
+   ```bash
+   kubectl get servicemonitor,podmonitor -n fluss
+   ```
+2. Verify Prometheus is scraping targets:
+   ```bash
+   kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090
+   # Visit http://localhost:9090/targets
+   ```
+3. Check pod annotations:
+   ```bash
+   kubectl get pod -n fluss -o yaml | grep prometheus.io
+   ```
+
+### Pods Not Scheduling
+
+**Issue**: Pods stuck in Pending state
+
+**Solution**:
+1. Check node availability:
+   ```bash
+   kubectl get nodes
+   kubectl describe nodes
+   ```
+2. Check node selectors and tolerations match:
+   ```bash
+   kubectl describe pod <pod-name> -n fluss | grep -A 5 "Node-Selectors\|Tolerations"
+   ```
+3. Verify node labels:
+   ```bash
+   kubectl get nodes --show-labels
+   ```
+
+## Quick Reference Commands
+
+### Access Services
+
+```bash
+# Flink Web UI
+kubectl port-forward -n fluss svc/flink-jobmanager 8081:8081
+
+# Grafana
+kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80
+
+# Prometheus
+kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090
+
+# Producer Metrics
+kubectl port-forward -n fluss svc/fluss-producer-metrics 8080:8080
+```
+
+### View Logs
+
+```bash
+# Producer
+kubectl logs -n fluss -l app=fluss-producer -f
+
+# Flink JobManager
+kubectl logs -n fluss -l app=flink,component=jobmanager -f
+
+# Flink TaskManager
+kubectl logs -n fluss -l app=flink,component=taskmanager -f
+
+# Fluss Coordinator
+kubectl logs -n fluss -l app.kubernetes.io/component=coordinator -f
+
+# Fluss Tablet Servers
+kubectl logs -n fluss -l app.kubernetes.io/component=tablet-server -f
+```
+
+### Restart Components
+
+```bash
+# Restart producer (using optimal configuration)
+kubectl delete job -n fluss fluss-producer
+cd aws-deploy-fluss/high-infra/k8s/jobs
+./deploy-producer-optimal.sh
+
+# Restart Flink job
+cd aws-deploy-fluss/high-infra/k8s/flink
+./submit-job-from-image.sh
+```
+
+## Cleanup
+
+To remove all deployed components (but keep infrastructure):
+
+```bash
+# Delete producer job
+kubectl delete job -n fluss fluss-producer
+
+# Delete Flink job (via REST API or UI)
+kubectl port-forward -n fluss svc/flink-jobmanager 8081:8081
+# Visit http://localhost:8081 and cancel job
+
+# Delete all components
+cd aws-deploy-fluss/high-infra/k8s
+kubectl delete -f flink/
+kubectl delete -f zookeeper/
+helm uninstall fluss -n fluss
+helm uninstall prometheus -n monitoring
+kubectl delete namespace fluss monitoring
+```
+
+## Notes
+
+- **ECR Images**: Ensure images are pushed to ECR before deployment
+- **Storage**: Local NVMe storage setup is required for tablet servers if using persistence
+- **Node Types**: All components are configured to run on specific node groups:
+  - Coordinator: c5.2xlarge
+  - Tablet Servers: i7i.8xlarge (with NVMe)
+  - Flink JobManager: c5.4xlarge
+  - Flink TaskManager: c5.4xlarge
+  - Producer: c5.2xlarge
+- **Parallelism**: Flink job is configured with parallelism 32 (16 slots per TaskManager × 2 TaskManagers)
+- **Scan Mode**: Flink job reads from `latest` position (configured via SQL hint `scan.startup.mode = 'latest'`), meaning it only processes new data and doesn't read historical data from the beginning
+
diff --git a/e2e-iot/high-infra/DEPLOYMENT_FIXES.md b/e2e-iot/high-infra/DEPLOYMENT_FIXES.md
new file mode 100644
index 0000000..8623b60
--- /dev/null
+++ b/e2e-iot/high-infra/DEPLOYMENT_FIXES.md
@@ -0,0 +1,177 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Deployment Fixes Documentation
+
+## Issues Fixed
+
+### 1. Grafana Dashboard Not Deployed
+
+**Problem:**
+- The `deploy.sh` script was applying the Grafana dashboard ConfigMap but not importing it via the Grafana API
+- This meant the dashboard ConfigMap existed but wasn't visible in Grafana UI
+- Users had to manually run `deploy-dashboard.sh` separately
+
+**Root Cause:**
+- The `deploy.sh` script only ran `kubectl apply` on the ConfigMap
+- Grafana's auto-discovery can be slow or unreliable
+- The separate `deploy-dashboard.sh` script imports via API for immediate visibility
+
+**Fix Applied:**
+- Updated `deploy.sh` step 8 to:
+  1. Apply the ConfigMap (as before)
+  2. Wait for Grafana pod to be ready
+  3. Extract dashboard JSON from ConfigMap
+  4. Import dashboard via Grafana REST API (`/api/dashboards/db`)
+  5. Verify import success
+
+**Location:**
+- File: `k8s/deploy.sh`
+- Step: `[8/9] Deploying Grafana dashboard...`
+
+**Result:**
+- Dashboard is now automatically imported and visible immediately after deployment
+- Falls back gracefully if Grafana pod isn't ready or API import fails
+
+---
+
+### 2. Producer Deployment Issues
+
+**Problem 1: JAR Path Incorrect**
+- Initial JAR path: `/app/fluss-flink-realtime-demo.jar`
+- Issue: The `fluss-demo` Docker image places the JAR at `/opt/flink/usrlib/fluss-flink-realtime-demo.jar`
+- Error: `Error: Unable to access jarfile /app/fluss-flink-realtime-demo.jar`
+
+**Fix:**
+- Updated `producer-job.yaml` args to use: `/opt/flink/usrlib/fluss-flink-realtime-demo.jar`
+
+**Problem 2: Entrypoint Script Missing**
+- Initial command: `/app/entrypoint.sh`
+- Issue: The `fluss-demo` Docker image doesn't include an entrypoint script
+- Error: `exec: "/app/entrypoint.sh": stat /app/entrypoint.sh: no such file or directory`
+
+**Fix:**
+- Updated `producer-job.yaml` command to: `java` (direct execution)
+- Updated args to: `-jar /opt/flink/usrlib/fluss-flink-realtime-demo.jar`
+
+**Problem 3: InitContainer Environment Variable Substitution**
+- Initial setup: Used `${COORD_HOST}` and `${COORD_PORT}` environment variables
+- Issue: `envsubst` was replacing these variables before Kubernetes could inject them
+- Error: `nc -zv "" ""` (empty hostname and port)
+
+**Fix:**
+- Hardcoded coordinator hostname in initContainer script: `coordinator-server-hs.fluss.svc.cluster.local:9124`
+- This avoids `envsubst` substitution issues
+
+**Problem 4: Missing Environment Variable Defaults**
+- Issue: When `deploy.sh` runs without setting producer environment variables, `envsubst` leaves literal `${PRODUCER_RATE}` etc. in the YAML
+- Error: Kubernetes rejects the YAML with invalid resource values
+
+**Fix:**
+- Updated `deploy.sh` step 6 to set default values for all producer environment variables before applying `producer-job.yaml`
+- Defaults match those in `deploy-producer.sh`:
+  - `PRODUCER_RATE=2000`
+  - `PRODUCER_FLUSH_EVERY=20000`
+  - `PRODUCER_STATS_EVERY=1000`
+  - `CLIENT_WRITER_BUFFER_MEMORY_SIZE=128mb`
+  - `CLIENT_WRITER_BATCH_SIZE=16mb`
+  - `PRODUCER_MEMORY_REQUEST=2Gi`
+  - `PRODUCER_MEMORY_LIMIT=8Gi`
+  - `PRODUCER_CPU_REQUEST=1000m`
+  - `PRODUCER_CPU_LIMIT=4000m`
+  - `BOOTSTRAP=coordinator-server-hs.fluss.svc.cluster.local:9124`
+  - `DATABASE=iot`
+  - `TABLE=sensor_readings`
+  - `BUCKETS=12`
+
+**Location:**
+- File: `k8s/jobs/producer-job.yaml` (JAR path, command fixes)
+- File: `k8s/deploy.sh` (default values)
+- File: `k8s/jobs/deploy-producer.sh` (already had defaults)
+
+**Result:**
+- Producer job can be deployed via `deploy.sh` without manual configuration
+- Producer job can also be deployed via `deploy-producer.sh` with custom values
+- Both methods work reliably
+
+---
+
+## Summary of Changes
+
+### Files Modified:
+
+1. **`k8s/deploy.sh`**
+   - Added Grafana API import step after ConfigMap deployment
+   - Added default environment variable values for producer deployment
+   - Updated step numbering (now 9 steps instead of 8)
+
+2. **`k8s/jobs/producer-job.yaml`**
+   - Fixed JAR path: `/app/fluss-flink-realtime-demo.jar` → `/opt/flink/usrlib/fluss-flink-realtime-demo.jar`
+   - Fixed command: `/app/entrypoint.sh` → `java`
+   - Fixed initContainer: Hardcoded coordinator hostname
+
+### Files Already Correct:
+
+- **`k8s/jobs/deploy-producer.sh`**: Already had proper defaults and error handling
+- **`k8s/monitoring/deploy-dashboard.sh`**: Already had API import logic
+
+---
+
+## Testing
+
+To verify the fixes:
+
+1. **Dashboard Deployment:**
+   ```bash
+   cd aws-deploy-fluss/high-infra/k8s
+   ./deploy.sh fluss <demo-image-repo> latest <fluss-image-repo>
+   # Wait for deployment to complete
+   kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80
+   # Open http://localhost:3000 and verify "Fluss & Flink Monitoring Dashboard" is visible
+   ```
+
+2. **Producer Deployment:**
+   ```bash
+   # Via deploy.sh (uses defaults)
+   cd aws-deploy-fluss/high-infra/k8s
+   ./deploy.sh fluss <demo-image-repo> latest <fluss-image-repo>
+   # Check producer pod
+   kubectl get pods -n fluss -l app=fluss-producer
+   kubectl logs -n fluss -l app=fluss-producer
+   
+   # Via deploy-producer.sh (custom values)
+   cd aws-deploy-fluss/high-infra/k8s/jobs
+   ./deploy-producer.sh --rate 20000 --buckets 3
+   ```
+
+---
+
+## Future Improvements
+
+1. **Dashboard Auto-Discovery:**
+   - Consider using Grafana's sidecar pattern for automatic dashboard discovery
+   - This would eliminate the need for API import
+
+2. **Producer Configuration:**
+   - Consider using a ConfigMap for producer configuration instead of environment variables
+   - This would make configuration changes easier without redeploying
+
+3. **Error Handling:**
+   - Add retry logic for Grafana API import
+   - Add validation for producer environment variables before deployment
+
diff --git a/e2e-iot/high-infra/DEPLOYMENT_INSTRUCTIONS.md b/e2e-iot/high-infra/DEPLOYMENT_INSTRUCTIONS.md
new file mode 100644
index 0000000..a299513
--- /dev/null
+++ b/e2e-iot/high-infra/DEPLOYMENT_INSTRUCTIONS.md
@@ -0,0 +1,700 @@
+# FLUSS 2-Million Messages Per Second - Deployment Instructions
+
+This document provides a comprehensive list of deployment instructions for the FLUSS 2-million-messages-per-second benchmark infrastructure.
+
+## Table of Contents
+
+1. [Prerequisites](#prerequisites)
+2. [Infrastructure Deployment (Terraform)](#infrastructure-deployment-terraform)
+3. [Kubernetes Configuration](#kubernetes-configuration)
+4. [Storage Setup](#storage-setup)
+5. [Component Deployment](#component-deployment)
+6. [Producer Deployment](#producer-deployment)
+7. [Flink Job Deployment](#flink-job-deployment)
+8. [Monitoring Setup](#monitoring-setup)
+9. [Verification](#verification)
+10. [Accessing Services](#accessing-services)
+11. [Troubleshooting](#troubleshooting)
+12. [Cleanup](#cleanup)
+
+---
+
+## Prerequisites
+
+Before starting deployment, ensure you have:
+
+- [ ] **AWS CLI** configured with appropriate credentials
+- [ ] **Terraform** >= 1.0 installed
+- [ ] **kubectl** installed and configured
+- [ ] **helm** >= 3.0 installed
+- [ ] **Docker** installed (for building images)
+- [ ] **Maven** installed (for building Java applications)
+- [ ] AWS account with permissions to:
+  - Create EKS clusters
+  - Create VPCs and subnets
+  - Create EC2 instances
+  - Create ECR repositories
+  - Create S3 buckets
+  - Create IAM roles and policies
+
+---
+
+## Infrastructure Deployment (Terraform)
+
+### Step 1: Configure Terraform Variables
+
+```bash
+cd demos/2-million-messages-per-second/high-infra/terraform
+cp terraform.tfvars.example terraform.tfvars
+# Edit terraform.tfvars with your values
+```
+
+**Key variables to configure:**
+- `aws_region` - AWS region (default: `us-west-2`)
+- `eks_cluster_name` - EKS cluster name (default: `fluss-eks-cluster`)
+- `fluss_image_repository` - ECR repository URL for Fluss image
+- `demo_image_repository` - ECR repository URL for demo image
+- `subnet_ids` - List of private subnet IDs for EC2 instances
+- `security_group_ids` - (Optional) Additional security groups
+- `key_name` - (Optional) SSH key pair name
+
+### Step 2: Build and Push Docker Images to ECR
+
+**Option A: Use Automated Script**
+
+```bash
+cd demos/2-million-messages-per-second/high-infra
+./push-images-to-ecr.sh
+```
+
+**Option B: Manual Build and Push**
+
+```bash
+# Build demo application
+cd demos/2-million-messages-per-second/fluss_flink_realtime
+mvn clean package
+docker build -t fluss-demo:latest .
+
+# Get ECR login
+AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
+AWS_REGION=us-west-2
+ECR_BASE="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com"
+aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${ECR_BASE}
+
+# Tag and push demo image
+docker tag fluss-demo:latest ${ECR_BASE}/fluss-demo:latest
+docker push ${ECR_BASE}/fluss-demo:latest
+
+# Pull, tag, and push Fluss image
+docker pull apache/fluss:0.8.0-incubating
+docker tag apache/fluss:0.8.0-incubating ${ECR_BASE}/fluss:0.8.0-incubating
+docker tag apache/fluss:0.8.0-incubating ${ECR_BASE}/fluss:latest
+docker push ${ECR_BASE}/fluss:0.8.0-incubating
+docker push ${ECR_BASE}/fluss:latest
+```
+
+### Step 3: Initialize and Apply Terraform
+
+```bash
+cd demos/2-million-messages-per-second/high-infra/terraform
+
+# Initialize Terraform
+terraform init
+
+# Review the plan
+terraform plan
+
+# Apply infrastructure
+terraform apply
+```
+
+**What gets created:**
+- EKS cluster with node groups:
+  - Coordinator nodes (1 node)
+  - Tablet server nodes (3 nodes)
+  - Flink JobManager node (1 node)
+  - Flink TaskManager nodes (6 nodes)
+  - Producer nodes (4 nodes)
+- ECR repositories (`fluss-demo`, `fluss`)
+- VPC with public/private subnets
+- EBS CSI driver (for persistent volumes)
+- S3 bucket for Flink checkpoints
+- IAM roles and policies
+
+**Wait for nodes to join the cluster:**
+```bash
+kubectl get nodes
+# Should show all nodes in Ready state
+```
+
+---
+
+## Kubernetes Configuration
+
+### Step 4: Update Kubeconfig
+
+```bash
+cd demos/2-million-messages-per-second/high-infra/terraform
+aws eks update-kubeconfig --region us-west-2 --name fluss-eks-cluster
+
+# Verify connection
+kubectl cluster-info
+kubectl get nodes
+```
+
+---
+
+## Storage Setup
+
+### Step 5: Setup Local NVMe Storage (for Tablet Servers)
+
+**IMPORTANT:** Tablet servers require NVMe storage for optimal performance.
+
+```bash
+cd demos/2-million-messages-per-second/high-infra/k8s/storage
+./setup-local-storage.sh
+```
+
+**What this script does:**
+- Creates `local-storage` StorageClass
+- Creates PersistentVolumes for tablet servers (3 PVs, 500Gi each)
+- Configures node affinity to tablet-server nodes
+- Sets up NVMe mount paths (`/opt/alldata/fluss/data`)
+
+**Verify storage setup:**
+```bash
+kubectl get storageclass local-storage
+kubectl get pv -l component=tablet-server
+kubectl get pv -l component=tablet-server -o yaml | grep -A 5 "path:"
+# Should show: path: /opt/alldata/fluss/data
+```
+
+---
+
+## Component Deployment
+
+### Step 6: Deploy All Components
+
+Deploy ZooKeeper, Fluss, Flink, and Monitoring stack:
+
+```bash
+cd demos/2-million-messages-per-second/high-infra/k8s
+
+# Get ECR repository URLs (adjust account ID and region as needed)
+DEMO_IMAGE_REPO="343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss-demo"
+FLUSS_IMAGE_REPO="343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss"
+
+# Deploy with ECR images
+./deploy.sh fluss "${DEMO_IMAGE_REPO}" latest "${FLUSS_IMAGE_REPO}"
+```
+
+**What gets deployed:**
+1. **Namespace**: Creates `fluss` namespace
+2. **ZooKeeper**: StatefulSet with 1 replica
+3. **Fluss**:
+   - Coordinator (1 replica)
+   - Tablet Servers (3 replicas)
+   - Uses ECR image: `${FLUSS_IMAGE_REPO}:latest`
+4. **Flink**:
+   - JobManager (1 replica)
+   - TaskManager (6 replicas, 32 slots each)
+   - Uses ECR image: `${DEMO_IMAGE_REPO}:latest`
+   - JAR embedded at: `/opt/flink/usrlib/fluss-flink-realtime-demo.jar`
+5. **Monitoring**:
+   - Prometheus (kube-prometheus-stack)
+   - Grafana
+   - ServiceMonitors and PodMonitors for metrics scraping
+6. **Grafana Dashboard**: ConfigMap with Fluss & Flink dashboard
+
+**Wait for components to be ready:**
+```bash
+# Check Fluss pods
+kubectl get pods -n fluss
+
+# Check monitoring pods
+kubectl get pods -n monitoring
+
+# Wait for ZooKeeper, the coordinator, and the Flink JobManager to become Ready
+kubectl wait --for=condition=ready pod -l app=zookeeper -n fluss --timeout=120s
+kubectl wait --for=condition=ready pod -l app.kubernetes.io/component=coordinator -n fluss --timeout=300s
+kubectl wait --for=condition=ready pod -l app=flink,component=jobmanager -n fluss --timeout=120s
+```
+
+---
+
+## Storage Verification
+
+### Step 7: Verify NVMe Storage for Tablet Servers
+
+**CRITICAL:** Verify tablet server storage is using NVMe drives before proceeding.
+
+```bash
+# Verify PVs exist and are bound
+kubectl get pv -l component=tablet-server
+
+# Check PV details - should show path: /opt/alldata/fluss/data
+kubectl get pv -l component=tablet-server -o yaml | grep -A 5 "path:"
+
+# Verify PVCs are bound to PVs
+kubectl get pvc -n fluss
+
+# Check tablet server pods and their volumes
+kubectl get pods -n fluss -l app.kubernetes.io/component=tablet-server -o wide
+
+# Verify mount paths inside tablet server pods
+kubectl exec -n fluss <tablet-server-pod-name> -- df -h | grep alldata
+
+# Check that data directory exists on NVMe
+kubectl exec -n fluss <tablet-server-pod-name> -- ls -la /opt/alldata/fluss/
+```
+
+**Expected Results:**
+- PVs should show `path: /opt/alldata/fluss/data`
+- Tablet server pods should have volumes mounted at `/opt/alldata/fluss`
+- Data directory should exist: `/opt/alldata/fluss/data`
+
+---
+
+## Producer Deployment
+
+### Step 8: Create Fluss Table
+
+Before deploying producers, create the Fluss table with 128 buckets:
+
+```bash
+cd demos/2-million-messages-per-second/high-infra/k8s/jobs
+
+# Create table with 128 buckets
+export BUCKETS=128
+./create-table.sh
+```
+
+**Verify table creation:**
+```bash
+kubectl logs -n fluss -l app=create-table --tail=50
+```
+
+### Step 9: Deploy Multi-Instance Producer
+
+Deploy 8 producer instances (2 per node across 4 producer nodes) with 128 buckets:
+
+**Option A: Use Multi-Instance Script (Recommended)**
+
+```bash
+cd demos/2-million-messages-per-second/high-infra/k8s/jobs
+
+# Deploy 8 producer instances with 128 buckets
+export BUCKETS=128
+./deploy-producer-multi-instance.sh --wait
+```
+
+**Option B: Custom Parameters**
+
+```bash
+cd demos/2-million-messages-per-second/high-infra/k8s/jobs
+
+# Deploy 8 producer instances with custom parameters
+export BUCKETS=128
+export PRODUCER_RATE=250000
+export TOTAL_PRODUCERS=8
+./deploy-producer-multi-instance.sh --wait
+```
+
+**Multi-Instance Configuration (defaults):**
+- **Total Instances**: 8 (instance IDs 0-7)
+- **Distribution**: 2 pods per producer node (4 nodes total)
+- **Rate per instance**: 250,000 records/second
+- **Total rate**: 2,000,000 records/second
+- **Flush**: Every 5,000 records (optimal for throughput)
+- **Batch Timeout**: 90ms (optimal for batching)
+- **Buffer Size**: 2GB
+- **Batch Size**: 128MB
+- **Memory**: 4Gi request, 16Gi limit per instance
+- **CPU**: 2000m request, 8000m limit per instance
+- **Writer Threads**: 48 per instance
+- **Buckets**: 128 (must match table bucket count)
+
+**Monitor producers:**
+```bash
+# Check producer pod status (should see 8 pods, 2 per node)
+kubectl get pods -n fluss -l app=fluss-producer -o wide
+
+# View producer logs (all instances)
+kubectl logs -n fluss -l app=fluss-producer -f
+
+# View logs for specific instance
+kubectl logs -n fluss -l app=fluss-producer,job-name=fluss-producer-0 -f
+
+# Check pod distribution across nodes
+kubectl get pods -n fluss -l app=fluss-producer -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.nodeName}{"\n"}{end}' | sort -k2
+
+# Check producer metrics
+kubectl port-forward -n fluss svc/fluss-producer-metrics 8080:8080
+# Then visit: http://localhost:8080/metrics
+```
+
+---
+
+## Flink Job Deployment
+
+### Step 10: Submit Flink Aggregator Job
+
+Submit the Flink job that processes sensor data:
+
+```bash
+cd demos/2-million-messages-per-second/high-infra/k8s/flink
+./submit-job-from-image.sh
+```
+
+**What this script does:**
+1. Cancels any existing Flink jobs
+2. Verifies JAR exists in Flink image at `/opt/flink/usrlib/fluss-flink-realtime-demo.jar`
+3. Uploads JAR from the image's local filesystem to Flink cluster
+4. Submits job via REST API with:
+   - Entry class: `org.apache.fluss.benchmarks.flink.FlinkSensorAggregatorJob`
+   - Parallelism: 192 (distributed across 6 TaskManager pods)
+   - Window: 1 minute
+   - **Scan mode**: `latest` (reads from latest position, not from beginning)
+5. Configures S3 checkpoints automatically (from Terraform outputs)
+
+**Monitor Flink job:**
+```bash
+# Check Flink pods
+kubectl get pods -n fluss -l app=flink
+
+# Access Flink Web UI
+kubectl port-forward -n fluss svc/flink-jobmanager 8081:8081
+# Then visit: http://localhost:8081
+
+# View Flink job logs
+kubectl logs -n fluss -l app=flink,component=taskmanager -f
+```
+
+**Verify S3 Checkpoints:**
+```bash
+# Get S3 bucket name from Terraform
+cd demos/2-million-messages-per-second/high-infra/terraform
+S3_BUCKET=$(terraform output -raw flink_s3_bucket_name)
+
+# Check checkpoints are being written to S3
+aws s3 ls s3://${S3_BUCKET}/flink-checkpoints/fluss-eks-cluster/ --recursive
+
+# Verify checkpoint configuration in Flink ConfigMap
+kubectl get configmap flink-config -n fluss -o yaml | grep -A 2 "state.checkpoints.dir"
+```
+
+---
+
+## Monitoring Setup
+
+### Step 11: Deploy Grafana Dashboard
+
+Deploy the Grafana dashboard for monitoring:
+
+```bash
+cd demos/2-million-messages-per-second/high-infra/k8s/monitoring
+./deploy-dashboard.sh
+```
+
+**Dashboard includes:**
+- Producer metrics (records/sec, latency)
+- Flink metrics (input/output rates, aggregator records)
+- Fluss Coordinator metrics (cluster status, table/bucket counts)
+- Fluss Tablet Server metrics (messages rate, bytes rate, replication)
+- JVM metrics (CPU, memory for coordinator and tablet servers)
+
+---
+
+## Verification
+
+### Step 12: Verify Complete Deployment
+
+**Check All Pods:**
+```bash
+# Fluss namespace
+kubectl get pods -n fluss
+
+# Expected:
+# - zk-0: Running
+# - coordinator-server-0: Running
+# - tablet-server-0, tablet-server-1, tablet-server-2: Running
+# - flink-jobmanager-*: Running
+# - flink-taskmanager-0 through flink-taskmanager-5: Running
+# - fluss-producer-*: Running (8 pods)
+
+# Monitoring namespace
+kubectl get pods -n monitoring
+```
+
+**Check Node Placement:**
+```bash
+# Verify nodes are on correct instance types
+kubectl get nodes -l node-type=coordinator
+kubectl get nodes -l node-type=tablet-server
+kubectl get nodes -l node-type=flink-jobmanager
+kubectl get nodes -l node-type=flink-taskmanager
+kubectl get nodes -l node-type=producer
+
+# Check pod placement
+kubectl get pods -n fluss -o wide
+```
+
+**Verify Metrics Scraping:**
+```bash
+# Check ServiceMonitors
+kubectl get servicemonitor -n fluss
+
+# Check PodMonitors
+kubectl get podmonitor -n fluss
+
+# Port-forward Prometheus and check targets
+kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090
+# Visit: http://localhost:9090/targets
+# All targets should be "UP"
+```
+
+**Verify Data Flow:**
+```bash
+# Check producer is writing data
+kubectl logs -n fluss -l app=fluss-producer --tail=50 | grep -i "records\|throughput"
+
+# Check Flink is processing data
+kubectl logs -n fluss -l app=flink,component=taskmanager --tail=50 | grep -i "aggregate\|records"
+
+# Check Fluss tablet servers are receiving data
+kubectl logs -n fluss -l app.kubernetes.io/component=tablet-server --tail=20
+```
+
+---
+
+## Accessing Services
+
+### Flink Web UI
+```bash
+kubectl port-forward -n fluss svc/flink-jobmanager 8081:8081
+# Open http://localhost:8081
+```
+
+### Grafana
+```bash
+# Port-forward Grafana
+kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80
+
+# Then visit: http://localhost:3000
+# Username: admin
+# Password: admin123
+```
+
+### Prometheus
+```bash
+kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090
+# Visit: http://localhost:9090
+```
+
+### Producer Metrics
+```bash
+kubectl port-forward -n fluss svc/fluss-producer-metrics 8080:8080
+# Visit: http://localhost:8080/metrics
+```
+
+---
+
+## Troubleshooting
+
+### Producer Not Starting
+
+**Issue**: Producer pod fails with "Unable to access jarfile"
+
+**Solution**: Ensure JAR path is correct in `producer-job.yaml`:
+- Correct path: `/opt/flink/usrlib/fluss-flink-realtime-demo.jar`
+- Image must be built with JAR embedded
+
+### Flink Job Submission Fails
+
+**Issue**: Cannot upload JAR or submit job
+
+**Solution**: 
+- Ensure Flink JobManager pod is Running
+- Check Flink REST API is accessible: `kubectl port-forward -n fluss svc/flink-jobmanager 8081:8081`
+- Verify JAR exists in Flink image:
+  ```bash
+  kubectl exec -n fluss <jobmanager-pod> -- test -f /opt/flink/usrlib/fluss-flink-realtime-demo.jar
+  ```
+- If JAR is missing, rebuild and push the image:
+  ```bash
+  cd demos/2-million-messages-per-second/high-infra/k8s/flink
+  ./build-and-push.sh
+  ```
+- Then restart Flink pods to pull the new image
+
+### Metrics Not Appearing
+
+**Issue**: No metrics in Grafana/Prometheus
+
+**Solution**:
+1. Check ServiceMonitors/PodMonitors are deployed:
+   ```bash
+   kubectl get servicemonitor,podmonitor -n fluss
+   ```
+2. Verify Prometheus is scraping targets:
+   ```bash
+   kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090
+   # Visit http://localhost:9090/targets
+   ```
+3. Check pod annotations:
+   ```bash
+   kubectl get pod -n fluss -o yaml | grep prometheus.io
+   ```
+
+### Pods Not Scheduling
+
+**Issue**: Pods stuck in Pending state
+
+**Solution**:
+1. Check node availability:
+   ```bash
+   kubectl get nodes
+   kubectl describe nodes
+   ```
+2. Check node selectors and tolerations match:
+   ```bash
+   kubectl describe pod <pod-name> -n fluss | grep -A 5 "Node-Selectors\|Tolerations"
+   ```
+3. Verify node labels:
+   ```bash
+   kubectl get nodes --show-labels
+   ```
+
+### Tablet Server Storage Not Using NVMe
+
+**Issue**: Tablet servers not using NVMe storage
+
+**Solution**:
+```bash
+# Check PV bindings
+kubectl get pv -l component=tablet-server
+kubectl get pvc -n fluss
+
+# Verify PV paths point to NVMe mount
+kubectl get pv fluss-tablet-data-0 -o jsonpath='{.spec.local.path}'
+# Should show: /opt/alldata/fluss/data
+
+# Check tablet server pod volumes
+kubectl describe pod <tablet-server-pod> -n fluss | grep -A 10 "Mounts:"
+
+# Verify NVMe is mounted on node
+kubectl debug node/<tablet-node> -it --image=busybox -- df -h | grep alldata
+```
+
+---
+
+## Cleanup
+
+To remove all deployed components (but keep infrastructure):
+
+```bash
+# Delete producer job
+kubectl delete job -n fluss fluss-producer
+
+# Delete Flink job (via REST API or UI)
+kubectl port-forward -n fluss svc/flink-jobmanager 8081:8081
+# Visit http://localhost:8081 and cancel the job, then press Ctrl+C to stop the port-forward before continuing
+
+# Delete all components
+cd demos/2-million-messages-per-second/high-infra/k8s
+kubectl delete -f flink/
+kubectl delete -f zookeeper/
+helm uninstall fluss -n fluss
+helm uninstall prometheus -n monitoring
+kubectl delete namespace fluss monitoring
+```
+
+To destroy infrastructure:
+
+```bash
+cd demos/2-million-messages-per-second/high-infra/terraform
+terraform destroy
+```
+
+---
+
+## Quick Reference Commands
+
+### View Logs
+```bash
+# Producer
+kubectl logs -n fluss -l app=fluss-producer -f
+
+# Flink JobManager
+kubectl logs -n fluss -l app=flink,component=jobmanager -f
+
+# Flink TaskManager
+kubectl logs -n fluss -l app=flink,component=taskmanager -f
+
+# Fluss Coordinator
+kubectl logs -n fluss -l app.kubernetes.io/component=coordinator -f
+
+# Fluss Tablet Servers
+kubectl logs -n fluss -l app.kubernetes.io/component=tablet-server -f
+```
+
+### Restart Components
+```bash
+# Restart producers (using the multi-instance configuration)
+kubectl delete job -n fluss fluss-producer
+cd demos/2-million-messages-per-second/high-infra/k8s/jobs
+./deploy-producer-multi-instance.sh
+
+# Restart Flink job
+cd demos/2-million-messages-per-second/high-infra/k8s/flink
+./submit-job-from-image.sh
+```
+
+---
+
+## Automated Deployment Script
+
+For automated deployment, use the master script:
+
+```bash
+cd demos/2-million-messages-per-second/high-infra/k8s/scripts
+
+# Set environment variables
+export DEMO_IMAGE_REPO=343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss-demo
+export FLUSS_IMAGE_REPO=343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss
+
+# Run all steps
+./deploy-benchmark.sh
+
+# Or run specific steps
+./deploy-benchmark.sh --start-from-step 5
+./deploy-benchmark.sh --only-step 6
+```
+
+---
+
+## Additional Resources
+
+- **[DEPLOY-STEPS.md](./DEPLOY-STEPS.md)** - Detailed step-by-step deployment guide
+- **[DEPLOYMENT_FIXES.md](./DEPLOYMENT_FIXES.md)** - Known issues and fixes
+- **[k8s/DEPLOYMENT.md](./k8s/DEPLOYMENT.md)** - Kubernetes deployment guide
+- **[MONITORING.md](./MONITORING.md)** - Monitoring setup and configuration
+- **[PRODUCER_CONFIG.md](./k8s/jobs/PRODUCER_CONFIG.md)** - Producer configuration guide
+- **[README.md](./README.md)** - Benchmark overview and architecture
+
+---
+
+## Notes
+
+- **ECR Images**: Ensure images are pushed to ECR before deployment
+- **Storage**: Local NVMe storage setup is required for tablet servers if using persistence
+- **Node Types**: All components are configured to run on specific node groups:
+  - Coordinator: c5.2xlarge
+  - Tablet Servers: i7i.8xlarge (with NVMe)
+  - Flink JobManager: c5.4xlarge
+  - Flink TaskManager: c5.4xlarge
+  - Producer: c5.2xlarge
+- **Parallelism**: Flink job is configured with parallelism 192 (32 slots per TaskManager × 6 TaskManagers)
+- **Scan Mode**: Flink job reads from `latest` position (configured via SQL hint `scan.startup.mode = 'latest'`), meaning it only processes new data and doesn't read historical data from the beginning
+- **Buckets**: Table must be created with 128 buckets to match producer configuration
+
diff --git a/e2e-iot/high-infra/MONITORING.md b/e2e-iot/high-infra/MONITORING.md
new file mode 100644
index 0000000..706ac13
--- /dev/null
+++ b/e2e-iot/high-infra/MONITORING.md
@@ -0,0 +1,228 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Fluss & Flink Monitoring Setup
+
+This document describes the monitoring infrastructure for the Fluss deployment, including Prometheus metrics collection and Grafana dashboards.
+
+## Overview
+
+The monitoring stack includes:
+- **Prometheus**: Metrics collection and storage
+- **Grafana**: Visualization and dashboards
+- **ServiceMonitors/PodMonitors**: Automatic service discovery for Prometheus scraping
+
+## Components Monitored
+
+### 1. Fluss Coordinator & Tablet Servers
+- **Metrics Port**: 9249
+- **Metrics Path**: `/metrics`
+- **Metrics Exposed**: 
+  - Request rates
+  - Latency metrics
+  - Write operations
+  - Client connections
+
+### 2. Flink Aggregator Job
+- **Metrics Port**: 9249
+- **Metrics Path**: `/metrics`
+- **Metrics Exposed**:
+  - Input records rate (`flink_taskmanager_job_task_operator_numRecordsIn`)
+  - Output records rate (`flink_taskmanager_job_task_operator_numRecordsOut`)
+  - Consumer lag (`flink_taskmanager_job_task_operator_fluss_consumer_lag`)
+  - Backpressure metrics
+  - Throughput metrics
+
+### 3. Producer Job
+- **Metrics Port**: 8080 (if custom metrics endpoint is added)
+- **Metrics Path**: `/metrics`
+- **Metrics Exposed**:
+  - Records written (`fluss_client_writer_records_total`)
+  - Write latency (`fluss_client_writer_sendLatencyMs`)
+  - Write rate
+
+## Accessing Grafana
+
+### Option 1: Port Forward (Recommended for Development)
+```bash
+kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80
+```
+Then access: http://localhost:3000
+- Username: `admin`
+- Password: `admin123`
+
+### Option 2: LoadBalancer (if configured)
+```bash
+kubectl get svc -n monitoring prometheus-grafana
+```
+Access the external LoadBalancer URL on port 80.
+
+## Accessing Prometheus
+
+### Port Forward
+```bash
+kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090
+```
+Then access: http://localhost:9090
+
+## Grafana Dashboards
+
+### Fluss & Flink Monitoring Dashboard
+The main dashboard includes:
+
+1. **Producer Metrics**
+   - Records rate (per second)
+   - Total records written
+   - Write latency (p95, p99)
+
+2. **Flink Aggregator Metrics**
+   - Input records rate
+   - Output records rate
+   - Consumer lag (with alerting)
+   - Backpressure metrics
+
+3. **Fluss Server Metrics**
+   - Coordinator request rate
+   - Tablet server write rate
+
+## Key Metrics to Monitor
+
+### Producer Metrics
+- `fluss_client_writer_records_total`: Total records written
+- `fluss_client_writer_sendLatencyMs`: Write latency histogram
+- Rate: `rate(fluss_client_writer_records_total[5m])`
+
+### Flink Aggregator Metrics
+- `flink_taskmanager_job_task_operator_numRecordsIn`: Input records
+- `flink_taskmanager_job_task_operator_numRecordsOut`: Output records
+- `flink_taskmanager_job_task_operator_fluss_consumer_lag`: Consumer lag
+- Rate: `rate(flink_taskmanager_job_task_operator_numRecordsIn[5m])`
+
+### Consumer Lag Alert
+The dashboard includes an alert for high consumer lag (>10,000 records). When this alert fires, the Flink aggregator is falling behind the producer.
+
+## Prometheus Queries
+
+### Producer Throughput
+```promql
+sum(rate(fluss_client_writer_records_total[5m])) by (instance)
+```
+
+### Flink Input Rate
+```promql
+sum(rate(flink_taskmanager_job_task_operator_numRecordsIn[5m])) by (job_name, task_name)
+```
+
+### Consumer Lag
+```promql
+flink_taskmanager_job_task_operator_fluss_consumer_lag
+```
+
+### Producer Latency (p95)
+```promql
+histogram_quantile(0.95, sum(rate(fluss_client_writer_sendLatencyMs_bucket[5m])) by (le, instance))
+```
+
+## Configuration
+
+### Fluss Prometheus Metrics
+Configured in `helm-charts/fluss/values.yaml`:
+```yaml
+configurationOverrides:
+  metrics.reporter.prometheus.class: org.apache.fluss.metrics.prometheus.PrometheusReporterPlugin
+  metrics.reporter.prometheus.port: "9249"
+  metrics.reporter: prometheus
+```
+
+### Flink Prometheus Metrics
+Configured via environment variables in `jobs.tf`:
+```hcl
+env {
+  name  = "FLINK_METRICS_REPORTERS"
+  value = "prom"
+}
+env {
+  name  = "FLINK_METRICS_REPORTER_PROM_CLASS"
+  value = "org.apache.flink.metrics.prometheus.PrometheusReporter"
+}
+env {
+  name  = "FLINK_METRICS_REPORTER_PROM_PORT"
+  value = "9249"
+}
+```
+
+## Troubleshooting
+
+### Metrics Not Appearing
+1. Check if pods have Prometheus annotations:
+   ```bash
+   kubectl get pods -n <namespace> -o yaml | grep prometheus.io
+   ```
+
+2. Check ServiceMonitor/PodMonitor resources:
+   ```bash
+   kubectl get servicemonitor -n <namespace>
+   kubectl get podmonitor -n <namespace>
+   ```
+
+3. Check Prometheus targets:
+   - Access Prometheus UI
+   - Go to Status > Targets
+   - Verify all targets are "UP"
+
+### Flink Metrics Not Available
+If Flink metrics are not appearing, ensure:
+1. Flink Prometheus reporter JAR is in the classpath
+2. Environment variables are set correctly
+3. Port 9249 is accessible from Prometheus
+
+### Producer Metrics Not Available
+The producer is a standalone Java application. To expose metrics, use one of the following approaches:
+1. Add a simple HTTP metrics endpoint
+2. Use Fluss client metrics (if available)
+3. Export metrics via JMX and use JMX exporter
+
+## Custom Metrics
+
+To add custom metrics to the producer:
+1. Use a metrics library (Micrometer, Prometheus client)
+2. Expose metrics on an HTTP endpoint
+3. Update the ServiceMonitor/PodMonitor to scrape the endpoint
+
+Example using Micrometer:
+```java
+MeterRegistry registry = new PrometheusMeterRegistry(PrometheusConfig.DEFAULT);
+Counter recordsCounter = Counter.builder("producer.records.total")
+    .description("Total records produced")
+    .register(registry);
+```
+
+## Resources
+
+- Prometheus: 2Gi memory, 1 CPU (requests)
+- Grafana: Included in kube-prometheus-stack
+- Retention: 30 days
+
+## Security Notes
+
+⚠️ **Production Recommendations**:
+- Change default Grafana password
+- Use TLS for Grafana and Prometheus
+- Restrict access via network policies
+- Use RBAC for Prometheus access
+
diff --git a/e2e-iot/high-infra/PRODUCER_THROUGHPUT_DROP_ISSUE.md b/e2e-iot/high-infra/PRODUCER_THROUGHPUT_DROP_ISSUE.md
new file mode 100644
index 0000000..d221dd3
--- /dev/null
+++ b/e2e-iot/high-infra/PRODUCER_THROUGHPUT_DROP_ISSUE.md
@@ -0,0 +1,174 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Producer Throughput Drop Issue - Root Cause Analysis
+
+## Problem Summary
+
+**Date:** December 7, 2025  
+**Issue:** Producer throughput dropped from 2M ops/sec to 1.4M ops/sec (and continued declining to ~0.92M ops/sec)  
+**Status:** Resolved
+
+## Symptoms
+
+1. **Throughput Degradation:**
+   - Expected: 2M ops/sec (8 producers × 250K ops/sec each)
+   - Actual: Started at 1.4M ops/sec, dropped to 1.17M ops/sec, then to 0.92M ops/sec
+   - Each producer achieving only ~145-150K ops/sec instead of 250K ops/sec
+
+2. **Error Messages in Producer Logs:**
+   ```
+   [fluss-write-sender-thread-1] WARN org.apache.fluss.client.write.Sender - Get error write response on table bucket TableBucket{tableId=0, bucket=X}, retrying (2147482754 attempts left). Error: LEADER_NOT_AVAILABLE_EXCEPTION. Error Message: Server -1 is not found in metadata cache.
+   ```
+
+3. **DNS Resolution Errors:**
+   ```
+   org.apache.fluss.exception.NetworkException: Disconnected from node tablet-server-X.tablet-server-hs.fluss.svc.cluster.local:9124
+   Caused by: java.net.UnknownHostException: tablet-server-X.tablet-server-hs.fluss.svc.cluster.local
+   ```
+
+4. **Coordinator Metrics:**
+   - Initially showed: `activeTabletServerCount: 0`, `bucketCount: 0`
+   - After restart: Eventually showed correct values but metadata cache was stale
+
+## Root Cause
+
+**Stale Metadata Cache in Fluss Coordinator**
+
+The issue occurred after Fluss tablet servers were restarted:
+
+1. **Timeline:**
+   - Tablet servers were restarted (tablet-server-2 was newly started ~2 minutes before the issue)
+   - Coordinator was restarted to refresh metadata
+   - Coordinator initialized and discovered tablet servers via ZooKeeper
+   - However, the coordinator's metadata cache became stale/inconsistent
+
+2. **Why It Happened:**
+   - When tablet servers restart, their registration in ZooKeeper changes
+   - Coordinator reads tablet server information from ZooKeeper during initialization
+   - But the coordinator's internal metadata cache (bucket-to-leader mappings) was not properly refreshed
+   - Producers query the coordinator for bucket leader information
+   - Coordinator returned stale metadata (Server -1 = invalid/not found)
+   - Producers couldn't find bucket leaders → retries → reduced throughput
+
+3. **Why ZooKeeper Was NOT the Issue:**
+   - ZooKeeper was running fine
+   - Coordinator successfully registered with ZooKeeper
+   - Tablet servers were properly registered in ZooKeeper
+   - The issue was in the coordinator's internal metadata cache, not ZooKeeper
+
+## Investigation Steps
+
+1. **Checked Producer Metrics:**
+   - Each producer showing ~145-150K ops/sec instead of 250K
+   - Total throughput: ~1.17M ops/sec (should be 2M)
+
+2. **Checked Producer Logs:**
+   - Found `LEADER_NOT_AVAILABLE_EXCEPTION` errors
+   - "Server -1 is not found in metadata cache" errors
+
+3. **Checked Fluss Coordinator:**
+   - Initially showed 0 active tablet servers, 0 buckets
+   - After restart, showed 3 tablet servers, 48 buckets, but metadata cache was stale
+
+4. **Checked Tablet Servers:**
+   - All 3 tablet servers running
+   - `produceLog` requests showing 0.0 ops/sec (not receiving writes)
+
+5. **Identified the Issue:**
+   - Coordinator metadata cache was stale
+   - Producers couldn't get valid bucket leader information
+
+## Solution
+
+### Step 1: Recreate the Table
+Recreated the Fluss table to force metadata refresh:
+
+```bash
+cd aws-deploy-fluss/high-infra/k8s/jobs
+./create-table.sh \
+  --namespace fluss \
+  --bootstrap coordinator-server-hs.fluss.svc.cluster.local:9124 \
+  --database iot \
+  --table sensor_readings \
+  --buckets 48 \
+  --image-repo <aws-account-id>.dkr.ecr.us-west-2.amazonaws.com/fluss-demo \
+  --image-tag latest
+```
+
+This:
+- Dropped the existing table (all data previously written to it is discarded)
+- Recreated it with 48 buckets
+- Forced coordinator to rebuild bucket-to-leader mappings
+
+### Step 2: Restart Producers
+Restarted all producer jobs to get fresh metadata:
+
+```bash
+kubectl delete jobs -n fluss -l app=fluss-producer
+cd aws-deploy-fluss/high-infra/k8s/jobs
+TOTAL_PRODUCERS=8 PRODUCER_RATE=250000 ./deploy-producer-multi-instance.sh --wait
+```
+
+This:
+- Deleted existing producer jobs
+- Redeployed 8 producer instances
+- Producers connected with fresh metadata cache
+
+### Result
+- Throughput recovered to expected ~2M ops/sec
+- All producers achieving target 250K ops/sec each
+- No more `LEADER_NOT_AVAILABLE_EXCEPTION` errors
+
+## Prevention & Recommendations
+
+1. **When Restarting Fluss Components:**
+   - If restarting tablet servers, wait for coordinator to fully initialize before restarting producers
+   - Monitor coordinator metrics: `fluss_coordinator_activeTabletServerCount`, `fluss_coordinator_bucketCount`
+   - Ensure these metrics show correct values before deploying producers
+
+2. **If Throughput Drops:**
+   - Check producer logs for `LEADER_NOT_AVAILABLE_EXCEPTION` errors
+   - Check coordinator metrics for tablet server and bucket counts
+   - If metadata is stale, recreate the table (this drops the table's existing data) and restart producers
+
+3. **Monitoring:**
+   - Set up alerts for:
+     - Producer throughput below expected threshold
+     - `LEADER_NOT_AVAILABLE_EXCEPTION` errors in producer logs
+     - Coordinator showing 0 tablet servers or 0 buckets
+
+4. **Best Practices:**
+   - Avoid restarting multiple Fluss components simultaneously
+   - Restart order: ZooKeeper → Coordinator → Tablet Servers → Producers
+   - Wait for each component to be fully ready before proceeding
+
+## Related Files
+
+- Producer deployment script: `aws-deploy-fluss/high-infra/k8s/jobs/deploy-producer-multi-instance.sh`
+- Table creation script: `aws-deploy-fluss/high-infra/k8s/jobs/create-table.sh`
+- Producer job manifest: `aws-deploy-fluss/high-infra/k8s/jobs/producer-job.yaml`
+
+## Key Takeaways
+
+- **Not a ZooKeeper issue** - ZooKeeper was functioning correctly
+- **Coordinator metadata cache** - The coordinator's internal cache can become stale after restarts
+- **Solution is straightforward** - Recreate table and restart producers
+- **Prevention** - Proper restart order and monitoring can prevent this issue
+
+
diff --git a/e2e-iot/high-infra/README.md b/e2e-iot/high-infra/README.md
new file mode 100644
index 0000000..59da697
--- /dev/null
+++ b/e2e-iot/high-infra/README.md
@@ -0,0 +1,313 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# AWS Fluss Deployment with Terraform
+
+This directory contains Terraform configurations to deploy Fluss, a producer, and a Flink consumer on AWS EKS.
+
+## Prerequisites
+
+1. **AWS CLI** configured with appropriate credentials
+2. **Terraform** >= 1.0
+3. **kubectl** configured to access your EKS cluster
+4. **Helm** >= 3.0
+5. An existing **EKS cluster** (or uncomment EKS module in `main.tf`)
+
+## Directory Structure
+
+```
+high-infra/
+├── terraform/           # Terraform configurations
+│   ├── main.tf         # Main Terraform configuration
+│   ├── variables.tf    # Variable definitions
+│   ├── outputs.tf      # Output values
+│   ├── zookeeper.tf    # ZooKeeper deployment
+│   ├── fluss.tf        # Fluss Helm deployment
+│   ├── jobs.tf         # Producer and Flink consumer jobs
+│   ├── ecr.tf          # ECR repository for demo app
+│   └── terraform.tfvars.example
+├── helm-charts/        # Helm chart values
+│   └── fluss-values.yaml
+└── manifests/          # Additional Kubernetes manifests (if needed)
+```
+
+## Setup
+
+### 1. Configure Terraform Variables
+
+```bash
+cd terraform
+cp terraform.tfvars.example terraform.tfvars
+# Edit terraform.tfvars with your values
+```
+
+**Important**: Configure the following for EC2 instances:
+- `subnet_ids`: List of private subnet IDs where EC2 instances will be launched
+- `security_group_ids`: (Optional) Additional security groups for EC2 instances
+- `key_name`: (Optional) SSH key pair name for EC2 access
+- Instance types and counts for coordinator and tablet servers
+
+### 2. Build and Push Images to ECR
+
+Use the provided script to build and push all images (demo app and Fluss):
+
+```bash
+# Make sure AWS CLI is configured
+aws configure
+
+# Run the push script (it will create ECR repos if they don't exist)
+./push-images-to-ecr.sh
+```
+
+This script will:
+1. Create ECR repositories (if they don't exist)
+2. Build the demo application image (fluss-demo)
+3. Push demo image to ECR
+4. Pull Apache Fluss image from Docker Hub
+5. Push Fluss image to ECR
+
+After running, the script will display the ECR URLs. These are automatically configured in `terraform/terraform.tfvars` if you're using the default AWS account and region. Otherwise, update `terraform.tfvars` with your ECR URLs.
+
+Alternatively, you can manually build and push:
+
+```bash
+# Build demo app
+cd ../../demos/demo/fluss_flink_realtime_demo
+mvn clean package
+docker build -t fluss-demo:latest .
+
+# Get ECR login
+AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
+AWS_REGION=us-west-2
+ECR_BASE="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com"
+aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${ECR_BASE}
+
+# Tag and push demo image
+docker tag fluss-demo:latest ${ECR_BASE}/fluss-demo:latest
+docker push ${ECR_BASE}/fluss-demo:latest
+
+# Pull, tag, and push Fluss image
+docker pull apache/fluss:0.8.0-incubating
+docker tag apache/fluss:0.8.0-incubating ${ECR_BASE}/fluss:0.8.0-incubating
+docker tag apache/fluss:0.8.0-incubating ${ECR_BASE}/fluss:latest
+docker push ${ECR_BASE}/fluss:0.8.0-incubating
+docker push ${ECR_BASE}/fluss:latest
+```
+
+### 3. Initialize Terraform
+
+```bash
+cd terraform
+terraform init
+```
+
+### 4. Plan Deployment
+
+```bash
+terraform plan
+```
+
+### 5. Apply Configuration
+
+```bash
+terraform apply
+```
+
+## What Gets Deployed
+
+1. **ECR Repositories**: 
+   - `fluss-demo`: For demo application (used by producer and Flink aggregator)
+   - `fluss`: For Apache Fluss image
+2. **EBS CSI Driver** (if `install_ebs_csi_driver = true`):
+   - EKS addon for gp3 PersistentVolume support
+   - IAM role with required permissions
+   - Required if `enable_persistence = true`
+3. **EC2 Instances** (Dedicated nodes):
+   - Coordinator nodes: Labeled with `fluss-component=coordinator`
+   - Tablet server nodes: Labeled with `fluss-component=tablet-server`
+   - Each instance type and count is configurable
+   - Root volumes: gp3, 100GB for tablet servers, 50GB for coordinators
+4. **IAM Roles**: 
+   - For EC2 instances to join EKS cluster
+   - For EBS CSI driver (if installed)
+5. **Kubernetes Namespace**: `fluss` namespace
+6. **ZooKeeper**: StatefulSet with headless service
+7. **Fluss**: Deployed via Helm chart
+   - Coordinator server (StatefulSet) - scheduled on coordinator nodes only
+   - Tablet servers (StatefulSet) - scheduled on tablet-server nodes only
+   - Services and ConfigMaps
+   - Uses ECR image if `use_ecr_for_fluss = true`
+   - Node selectors ensure pods run on dedicated nodes
+8. **Producer Job**: Kubernetes Job that writes sensor data to Fluss
+9. **Flink Aggregator Job**: Kubernetes Job that processes and aggregates data
+
+## Configuration
+
+### Fluss Configuration
+
+The Fluss Helm chart is configured via `helm-charts/fluss-values.yaml`. Key settings:
+
+- **Persistence**: 
+  - `enable_persistence = false`: Tablet servers write to `/tmp/fluss/data` on the EC2 root volume (gp3)
+  - `enable_persistence = true`: Creates separate EBS volumes (PersistentVolumes) for each tablet server pod
+- **Replicas**: Number of coordinator and tablet server replicas
+- **Storage**: Storage class and size for persistent volumes (only used if `enable_persistence = true`)
+- **ZooKeeper**: Connection to ZooKeeper service
+
+**Note**: 
+- With `enable_persistence = false`, tablet servers use the root gp3 volume of the EC2 instances. The root volume is configured with 100GB for tablet servers and 50GB for coordinators.
+- The EBS CSI driver is automatically installed by Terraform (if `install_ebs_csi_driver = true`) to support gp3 PersistentVolumes. This is required if you enable persistence later.
+
+### Jobs Configuration
+
+The producer and Flink aggregator jobs are configured in `jobs.tf`:
+
+- **Producer**: Writes sensor data at configurable rate
+- **Flink Aggregator**: Processes data with 1-minute windows
+
+## Accessing Fluss
+
+After deployment, you can access the Fluss coordinator:
+
+```bash
+# Port forward to coordinator
+kubectl port-forward -n fluss svc/coordinator-server-hs 9124:9124
+
+# In another terminal, test connection
+java -cp target/fluss-flink-realtime-demo.jar \
+  org.apache.fluss.benchmarks.inspect.FlussMetadataInspector localhost:9124
+```
+
+## Monitoring
+
+Check pod status:
+
+```bash
+kubectl get pods -n fluss
+```
+
+View logs:
+
+```bash
+# Producer logs
+kubectl logs -n fluss -l app=fluss-producer --tail=50 -f
+
+# Flink aggregator logs
+kubectl logs -n fluss -l app=flink-aggregator --tail=50 -f
+
+# Fluss coordinator logs
+kubectl logs -n fluss coordinator-server-0 --tail=50 -f
+```
+
+## Cleanup
+
+To destroy all resources:
+
+```bash
+terraform destroy
+```
+
+**Note**: This will delete:
+- ECR repositories (`fluss-demo` and `fluss`) and all images
+- All Kubernetes resources (Fluss, ZooKeeper, Jobs)
+- Namespace
+
+## Troubleshooting
+
+### Jobs Not Starting
+
+1. Check if Fluss coordinator is ready:
+   ```bash
+   kubectl get pods -n fluss
+   kubectl logs -n fluss coordinator-server-0
+   ```
+
+2. Verify image is accessible:
+   ```bash
+   kubectl describe job -n fluss fluss-producer
+   ```
+
+3. Check init container logs:
+   ```bash
+   kubectl logs -n fluss <pod-name> -c wait-for-fluss
+   ```
+
+### Image Pull Errors
+
+Ensure:
+1. ECR repository exists and image is pushed
+2. EKS nodes have IAM role with ECR read permissions
+3. Image tag matches `demo_image_tag` in `terraform.tfvars`
+
+### Fluss Not Starting
+
+1. Check ZooKeeper is running:
+   ```bash
+   kubectl get pods -n fluss -l app=zookeeper
+   ```
+
+2. Check Fluss logs:
+   ```bash
+   kubectl logs -n fluss coordinator-server-0
+   kubectl logs -n fluss tablet-server-0
+   ```
+
+## Customization
+
+### Add EKS Cluster Creation
+
+If you need to create the EKS cluster, uncomment and configure the EKS module in `main.tf` or add:
+
+```hcl
+module "eks" {
+  source = "terraform-aws-modules/eks/aws"
+  # ... configuration
+}
+```
+
+### Modify Resource Limits
+
+Edit `jobs.tf` to adjust CPU/memory requests and limits for producer and aggregator jobs.
+
+### Change Fluss Configuration
+
+Modify `helm-charts/fluss-values.yaml` or add additional values in `fluss.tf`.
+
+### Node Scheduling
+
+The Helm chart is configured with node selectors and affinity rules:
+- **Coordinator pods** will only run on nodes labeled with `fluss-component=coordinator`
+- **Tablet server pods** will only run on nodes labeled with `fluss-component=tablet-server`
+- Coordinator nodes have a taint (`NoSchedule`) to prevent other pods from running on them
+
+This ensures:
+- Coordinator runs on dedicated EC2 instances
+- Tablet servers run on dedicated EC2 instances
+- No resource contention between components
+
+### EC2 Instance Configuration
+
+Edit `nodes.tf` or set variables in `terraform.tfvars`:
+- `coordinator_instance_type`: EC2 instance type for coordinator (default: t3.medium)
+- `tablet_server_instance_type`: EC2 instance type for tablet servers (default: t3.medium)
+- `coordinator_instance_count`: Number of coordinator instances (default: 1)
+- `tablet_server_instance_count`: Number of tablet server instances (default: 3)
+- `subnet_ids`: List of subnet IDs where instances will be launched (required)
+- `key_name`: SSH key pair name for EC2 access (optional)
+- `security_group_ids`: Additional security groups (optional)
+
diff --git a/e2e-iot/high-infra/download-helm-chart.sh b/e2e-iot/high-infra/download-helm-chart.sh
new file mode 100755
index 0000000..0559122
--- /dev/null
+++ b/e2e-iot/high-infra/download-helm-chart.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail
+
+# Downloads the Fluss Helm chart tarball and unpacks it into helm-charts/fluss next to this script.
+# Environment: FLUSS_VERSION selects the chart version (default: 0.8.0-incubating).
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+HELM_CHARTS_DIR="${SCRIPT_DIR}/helm-charts"
+FLUSS_VERSION=${FLUSS_VERSION:-0.8.0-incubating}
+CHART_URL="https://downloads.apache.org/incubator/fluss/helm-chart/fluss-${FLUSS_VERSION}.tgz"
+
+echo "Downloading Fluss Helm chart version ${FLUSS_VERSION}..."
+
+# Create helm-charts directory if it doesn't exist
+mkdir -p "${HELM_CHARTS_DIR}"
+
+# Work in a throwaway directory; the single-quoted trap expands (and quotes) the path at exit time.
+TEMP_DIR=$(mktemp -d)
+trap 'rm -rf -- "${TEMP_DIR}"' EXIT
+
+cd "${TEMP_DIR}"
+curl -fL -o "fluss-${FLUSS_VERSION}.tgz" "${CHART_URL}"  # -f: exit non-zero on HTTP errors instead of saving the error page
+
+# Extract chart
+tar -xzf "fluss-${FLUSS_VERSION}.tgz"
+
+# Replace any previously downloaded chart with the freshly extracted copy
+if [ -d "fluss" ]; then
+    rm -rf "${HELM_CHARTS_DIR}/fluss"
+    cp -r "fluss" "${HELM_CHARTS_DIR}/"
+    echo "✓ Fluss Helm chart extracted to ${HELM_CHARTS_DIR}/fluss"
+else
+    echo "Error: Chart extraction failed" >&2
+    exit 1
+fi
+
+echo "Chart download complete!"
+
diff --git a/e2e-iot/high-infra/helm-charts/fluss-values.yaml b/e2e-iot/high-infra/helm-charts/fluss-values.yaml
new file mode 100644
index 0000000..dd92632
--- /dev/null
+++ b/e2e-iot/high-infra/helm-charts/fluss-values.yaml
@@ -0,0 +1,79 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+persistence:
+  enabled: ${enable_persistence}
+  storageClass: ${storage_class}
+  size: ${storage_size}
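+  # Set to true to use local NVMe storage (requires a local-storage StorageClass and PVs).
+  # When true, storageClass should be set to "local-storage". NOTE(review): the ":-false" default below is shell-style syntax; confirm the tool that renders this template supports it.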
+  local_storage: ${local_storage:-false}
+
+image:
+  # If using ECR, registry should be empty and repository should be the full ECR URL
+  # If using Docker Hub, registry should be "docker.io" and repository should be "apache/fluss"
+  registry: ${fluss_image_registry}
+  repository: ${fluss_image_repository}
+  tag: ${fluss_version}
+
+coordinator:
+  replicas: ${coordinator_replicas}
+  nodeSelector:
+    fluss-component: coordinator
+  tolerations:
+    - key: "fluss-component"
+      operator: "Equal"
+      value: "coordinator"
+      effect: "NoSchedule"
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+          - matchExpressions:
+              - key: fluss-component
+                operator: In
+                values:
+                  - coordinator
+
+tabletServer:
+  replicas: ${tablet_server_replicas}
+  nodeSelector:
+    fluss-component: tablet-server
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+          - matchExpressions:
+              - key: fluss-component
+                operator: In
+                values:
+                  - tablet-server
+
+configurationOverrides:
+  "zookeeper.address": zk-svc.${namespace}.svc.cluster.local:2181
+  # Log retention/purging configuration
+  # table.log.ttl: Time to live for log segments (default: 7 days)
+  #   - Set to -1 to disable log deletion (keep all segments)
+  #   - Set to longer duration (e.g., "30d") to retain logs longer
+  #   - Format: Duration string (e.g., "7d", "168h", "604800s")
+  #   - This prevents Flink offset out-of-range errors when segments are purged
+  # "table.log.ttl": "-1"  # Uncomment to disable log deletion
+  # "table.log.ttl": "30d"  # Uncomment to retain logs for 30 days
+  # log.segment.file-size: Size of each log segment file (default: 1024m)
+  #   - Larger segments = fewer files but less granular retention control
+  # "log.segment.file-size": "1024m"
+
diff --git a/e2e-iot/high-infra/helm-charts/fluss/.helmignore b/e2e-iot/high-infra/helm-charts/fluss/.helmignore
new file mode 100644
index 0000000..0e8a0eb
--- /dev/null
+++ b/e2e-iot/high-infra/helm-charts/fluss/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/e2e-iot/high-infra/helm-charts/fluss/Chart.yaml b/e2e-iot/high-infra/helm-charts/fluss/Chart.yaml
new file mode 100644
index 0000000..84e7e0a
--- /dev/null
+++ b/e2e-iot/high-infra/helm-charts/fluss/Chart.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: v2
+appVersion: 0.8.0-incubating
+description: A Helm chart for Kubernetes to deploy Apache Fluss CoordinatorServer
+  and TabletServers.
+home: https://fluss.apache.org
+icon: https://fluss.apache.org/img/logo/svg/colored_logo.svg
+maintainers:
+- name: Apache Fluss Community
+  url: https://github.com/apache/fluss
+name: fluss
+type: application
+version: 0.8.0-incubating
diff --git a/e2e-iot/high-infra/helm-charts/fluss/DISCLAIMER b/e2e-iot/high-infra/helm-charts/fluss/DISCLAIMER
new file mode 100644
index 0000000..ddc4f01
--- /dev/null
+++ b/e2e-iot/high-infra/helm-charts/fluss/DISCLAIMER
@@ -0,0 +1,10 @@
+Apache Fluss (incubating) is an effort undergoing incubation at The Apache
+Software Foundation (ASF), sponsored by the Apache Incubator PMC.
+
+Incubation is required of all newly accepted projects until a further review
+indicates that the infrastructure, communications, and decision making process
+have stabilized in a manner consistent with other successful ASF projects.
+
+While incubation status is not necessarily a reflection of the completeness
+or stability of the code, it does indicate that the project has yet to be
+fully endorsed by the ASF.
\ No newline at end of file
diff --git a/e2e-iot/high-infra/helm-charts/fluss/LICENSE b/e2e-iot/high-infra/helm-charts/fluss/LICENSE
new file mode 100644
index 0000000..f49a4e1
--- /dev/null
+++ b/e2e-iot/high-infra/helm-charts/fluss/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
\ No newline at end of file
diff --git a/e2e-iot/high-infra/helm-charts/fluss/NOTICE b/e2e-iot/high-infra/helm-charts/fluss/NOTICE
new file mode 100644
index 0000000..3c6de31
--- /dev/null
+++ b/e2e-iot/high-infra/helm-charts/fluss/NOTICE
@@ -0,0 +1,24 @@
+Apache Fluss (incubating)
+Copyright 2025 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+Apache Flink
+Copyright 2014-2024 The Apache Software Foundation
+
+Apache Kafka
+Copyright 2012-2024 The Apache Software Foundation
+
+Apache Paimon
+Copyright 2023-2024 The Apache Software Foundation
+
+----------------------------------------------------------
+
+This project includes code from Project LightProto, developed at Splunk,
+with the following copyright notice:
+
+LightProto
+Copyright 2020 Splunk Inc.
+
+----------------------------------------------------------
\ No newline at end of file
diff --git a/e2e-iot/high-infra/helm-charts/fluss/README.md b/e2e-iot/high-infra/helm-charts/fluss/README.md
new file mode 100644
index 0000000..3fcdc13
--- /dev/null
+++ b/e2e-iot/high-infra/helm-charts/fluss/README.md
@@ -0,0 +1,110 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+
+# Fluss Helm Chart
+
+This chart deploys an Apache Fluss cluster on Kubernetes, following Helm best practices.
+It requires a Zookeeper ensemble to be running in the same Kubernetes cluster. In future releases, we may add support for an embedded Zookeeper cluster.
+
+
+## Development environment 
+
+| component                                                                      | version |
+| ------------------------------------------------------------------------------ | ------- |
+| [Docker](https://docs.docker.com/)                                             | v28.3.2 |
+| [Minikube](https://minikube.sigs.k8s.io/docs/)                                 | v1.36.0 |
+| [Kubernetes](https://kubernetes.io)                                            | v1.25.3 |
+| [Helm](https://helm.sh)                                                        | v3.18.6 |
+| [Apache Fluss](https://fluss.apache.org/docs/)                                 | v0.8.0-incubating  |
+
+
+## Image requirements 
+
+A container image for Fluss is available on DockerHub as `fluss/fluss`. You can use it directly or build your own from this repo. To use your own image you need to build the project with [Maven](https://fluss.apache.org/community/dev/building/) and build it with Docker.
+
+The Maven build will create all required artifacts in the `build-target` directory. You need to copy it into the `docker` directory. The Dockerfile in this directory will copy these artifacts into the image.
+
+In minikube, you can use the local Docker daemon to build the image without pushing it to a registry:
+
+```bash
+eval $(minikube -p minikube docker-env)
+docker build -t fluss/fluss:0.8.0-incubating .
+```
+
+## Overview
+
+It creates:
+- 1x CoordinatorServer as a StatefulSet with a headless Service (stable per‑pod DNS)
+- 3x TabletServers as a StatefulSet with a headless Service (stable per‑pod DNS)
+- ConfigMap for server.yaml (CoordinatorServer and TabletServers) to override default Fluss configuration
+- Optional PersistentVolumes for data directories
+
+## Quick start
+
+1) ZooKeeper deployment:
+
+To start Zookeeper use Bitnami’s chart or your own deployment. If you have an existing Zookeeper cluster, you can skip this step. Example with Bitnami’s chart:
+
+```bash
+helm repo add bitnami https://charts.bitnami.com/bitnami
+helm repo update
+helm install zk bitnami/zookeeper \
+  --set replicaCount=3 \
+  --set auth.enabled=false \
+  --set persistence.size=5Gi
+```
+
+2) Default (Zookeeper available in-cluster):
+
+```bash
+helm install fluss ./fluss-helm
+```
+With an optional namespace flag `--namespace <your-namespace>` if you want to install it in a specific namespace.
+
+This assumes that ZooKeeper is reachable at `zk-zookeeper.<your-namespace>.svc.cluster.local:2181`. If your ZooKeeper address is different, you can override it with:
+
+```bash
+helm install fluss ./fluss-helm \
+  --set 'configurationOverrides.zookeeper\.address=<your-zk-address>'
+```
+
+## Configuration reference
+
+Important Fluss options surfaced by the chart:
+- zookeeper.address: CoordinatorServer and TabletServer point to your ZK ensemble.
+- data.dir, remote.data.dir: Local persistent path for data; remote path for snapshots (OSS/HDFS). TabletServers default to a PVC mounted at data.dir.
+- bind.listeners: Where the server actually binds.
+- advertised.listeners: Externally advertised endpoints for clients and intra‑cluster communication. In K8s, advertise stable names.
+- internal.listener.name: Which listener is used for internal communication (defaults to INTERNAL).
+- tablet-server.id: Required to be unique per TabletServer. The chart auto‑derives this from the StatefulSet pod ordinal at runtime.
+
+
+### Zookeeper and storage
+- zookeeper.address must point to a reachable ensemble.
+- data.dir is set to /opt/alldata/fluss/data in values.yaml; TabletServers mount a PVC at /opt/alldata/fluss when persistence.enabled=true and an emptyDir otherwise.
+
+## Resource management
+
+Set resources with requests/limits as appropriate for production. The defaults in values.yaml are sized for large nodes (TabletServers request 8 CPUs and 16Gi of memory each), so lower them for environments with few resources such as Minikube.
+
+## Troubleshooting
+- Image pull errors:
+  - If using a private registry, configure image.pullSecrets and ensure the image repository/tag are correct.
+- Pods not ready: ensure ZooKeeper is reachable and that ports 9123 (internal) and 9124 (client, used by the readiness probe) are open.
+- Connection failures: check advertised.listeners configuration and DNS resolution within the cluster by using kubectl exec to get a shell in a pod and test connectivity (using nc).
\ No newline at end of file
diff --git a/e2e-iot/high-infra/helm-charts/fluss/templates/_helpers.tpl b/e2e-iot/high-infra/helm-charts/fluss/templates/_helpers.tpl
new file mode 100644
index 0000000..79ae9d3
--- /dev/null
+++ b/e2e-iot/high-infra/helm-charts/fluss/templates/_helpers.tpl
@@ -0,0 +1,66 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "fluss.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create a default fully qualified app name.
+*/}}
+{{- define "fluss.fullname" -}}
+{{- if .Values.fullnameOverride -}}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- $name := default .Chart.Name .Values.nameOverride -}}
+{{- if contains $name .Release.Name -}}
+{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "fluss.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels
+*/}}
+{{- define "fluss.labels" -}}
+helm.sh/chart: {{ printf "%s-%s" .Chart.Name (.Chart.Version | replace "+" "_") | quote }}
+app.kubernetes.io/name: {{ include "fluss.name" . | quote }}
+app.kubernetes.io/instance: {{ .Release.Name | quote }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+app.kubernetes.io/managed-by: {{ .Release.Service | quote }}
+{{- end -}}
+
+{{/*
+Selector labels
+*/}}
+{{- define "fluss.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "fluss.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
\ No newline at end of file
diff --git a/e2e-iot/high-infra/helm-charts/fluss/templates/configmap.yaml b/e2e-iot/high-infra/helm-charts/fluss/templates/configmap.yaml
new file mode 100644
index 0000000..b0e868c
--- /dev/null
+++ b/e2e-iot/high-infra/helm-charts/fluss/templates/configmap.yaml
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: fluss-conf-file
+  labels:
+  {{- include "fluss.labels" . | nindent 4 }}
+data:
+  server.yaml: |
+    {{- range $key, $val := .Values.configurationOverrides }}
+    {{ $key }}: {{ tpl (printf "%v" $val) $ }}
+    {{- end }}
\ No newline at end of file
diff --git a/e2e-iot/high-infra/helm-charts/fluss/templates/sts-coordinator.yaml b/e2e-iot/high-infra/helm-charts/fluss/templates/sts-coordinator.yaml
new file mode 100644
index 0000000..f11fc4e
--- /dev/null
+++ b/e2e-iot/high-infra/helm-charts/fluss/templates/sts-coordinator.yaml
@@ -0,0 +1,114 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: coordinator-server
+  labels:
+  {{- include "fluss.labels" . | nindent 4 }}
+spec:
+  serviceName: coordinator-server-hs
+  replicas: 1 # can only be 1 for now
+  selector:
+    matchLabels:
+      {{- include "fluss.selectorLabels" . | nindent 6 }}
+      app.kubernetes.io/component: coordinator
+  template:
+    metadata:
+      labels:
+        {{- include "fluss.selectorLabels" . | nindent 8 }}
+        app.kubernetes.io/component: coordinator
+    spec:
+      {{- if .Values.coordinator.nodeSelector }}
+      nodeSelector:
+        {{- toYaml .Values.coordinator.nodeSelector | nindent 8 }}
+      {{- end }}
+      {{- if .Values.coordinator.tolerations }}
+      tolerations:
+        {{- toYaml .Values.coordinator.tolerations | nindent 8 }}
+      {{- end }}
+      {{- if .Values.coordinator.affinity }}
+      affinity:
+        {{- toYaml .Values.coordinator.affinity | nindent 8 }}
+      {{- end }}
+      containers:
+        - name: {{ .Chart.Name }}-coordinator
+          image: "{{.Values.image.repository}}:{{ .Values.image.tag }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          env:
+            - name: POD_NAME
+              valueFrom: 
+                fieldRef: 
+                  fieldPath: metadata.name 
+            - name: POD_IP
+              valueFrom: 
+                fieldRef: 
+                  fieldPath: status.podIP
+            - name: POD_NAMESPACE
+              valueFrom: 
+                fieldRef: 
+                  fieldPath: metadata.namespace
+            - name: NODE_IP
+              valueFrom:
+                fieldRef:
+                  fieldPath: status.hostIP
+          command:
+            - "/bin/sh"
+            - "-c"
+            - |
+              export FLUSS_SERVER_ID=${POD_NAME##*-} && \
+              cp /opt/conf/server.yaml $FLUSS_HOME/conf && \
+
+              echo "" >> $FLUSS_HOME/conf/server.yaml && \
+              echo "tablet-server.id: ${FLUSS_SERVER_ID}" >> $FLUSS_HOME/conf/server.yaml && \
+              # Coordinator must advertise both INTERNAL and CLIENT listeners
+              # INTERNAL listener (port 9123) is used by tablet servers for internal communication
+              # CLIENT listener (port 9124) is used by external clients
+              echo "bind.listeners: INTERNAL://0.0.0.0:{{ .Values.appConfig.internalPort }}, CLIENT://0.0.0.0:{{ .Values.appConfig.externalPort }}" >> $FLUSS_HOME/conf/server.yaml && \
+              echo "advertised.listeners: INTERNAL://${POD_IP}:{{ .Values.appConfig.internalPort }}, CLIENT://${POD_IP}:{{ .Values.appConfig.externalPort }}" >> $FLUSS_HOME/conf/server.yaml && \
+
+              bin/coordinator-server.sh start-foreground
+          livenessProbe:
+            failureThreshold: 100
+            timeoutSeconds: 1
+            initialDelaySeconds: 10
+            periodSeconds: 3
+            tcpSocket:
+              port: {{.Values.appConfig.externalPort}}
+          readinessProbe:
+            failureThreshold: 100
+            timeoutSeconds: 1
+            initialDelaySeconds: 10
+            periodSeconds: 3
+            tcpSocket:
+              port: {{.Values.appConfig.externalPort}}
+          resources: # coordinator sizing comes from resources.coordinatorServer, not the tablet servers' block
+            {{- toYaml .Values.resources.coordinatorServer | nindent 12 }}
+          volumeMounts:
+            - name: fluss-conf
+              mountPath: /opt/conf
+            - name: data
+              mountPath: /tmp/fluss/data
+      volumes:
+        - name: fluss-conf
+          configMap:
+            name: fluss-conf-file
+        # Coordinator never uses persistent storage - always use emptyDir
+        - name: data
+          emptyDir: {}
\ No newline at end of file
diff --git a/e2e-iot/high-infra/helm-charts/fluss/templates/sts-tablet.yaml b/e2e-iot/high-infra/helm-charts/fluss/templates/sts-tablet.yaml
new file mode 100644
index 0000000..c9c9ed7
--- /dev/null
+++ b/e2e-iot/high-infra/helm-charts/fluss/templates/sts-tablet.yaml
@@ -0,0 +1,121 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: tablet-server
+  labels:
+  {{- include "fluss.labels" . | nindent 4 }}
+spec:
+  serviceName: tablet-server-hs
+  replicas: {{ .Values.tabletServer.replicas }} # set via tabletServer.replicas in values.yaml (3 by default)
+  selector:
+    matchLabels:
+      {{- include "fluss.selectorLabels" . | nindent 6 }}
+      app.kubernetes.io/component: tablet
+  template:
+    metadata:
+      labels:
+        {{- include "fluss.selectorLabels" . | nindent 8 }}
+        app.kubernetes.io/component: tablet
+    spec:
+      {{- if .Values.tabletServer.nodeSelector }}
+      nodeSelector:
+        {{- toYaml .Values.tabletServer.nodeSelector | nindent 8 }}
+      {{- end }}
+      {{- if .Values.tabletServer.tolerations }}
+      tolerations:
+        {{- toYaml .Values.tabletServer.tolerations | nindent 8 }}
+      {{- end }}
+      {{- if .Values.tabletServer.affinity }}
+      affinity:
+        {{- toYaml .Values.tabletServer.affinity | nindent 8 }}
+      {{- end }}
+      containers:
+        - name: {{ .Chart.Name }}-tablet
+          image: "{{.Values.image.repository}}:{{ .Values.image.tag }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          env:
+            - name: POD_NAME
+              valueFrom: 
+                fieldRef: 
+                  fieldPath: metadata.name 
+            - name: POD_IP
+              valueFrom: 
+                fieldRef: 
+                  fieldPath: status.podIP
+            - name: POD_NAMESPACE
+              valueFrom: 
+                fieldRef: 
+                  fieldPath: metadata.namespace
+          command:
+            - "/bin/sh"
+            - "-c"
+            - |
+              export FLUSS_SERVER_ID=${POD_NAME##*-} && \
+              cp /opt/conf/server.yaml $FLUSS_HOME/conf && \
+              export PORT=$((9125 + ${FLUSS_SERVER_ID})) && \
+              # Create directories and symlink logs to persistent storage
+              mkdir -p /opt/alldata/fluss/data /opt/alldata/fluss/remote-data /opt/alldata/fluss/logs && \
+              rm -rf $FLUSS_HOME/log && ln -sf /opt/alldata/fluss/logs $FLUSS_HOME/log && \
+              echo "" >> $FLUSS_HOME/conf/server.yaml && \
+              echo "tablet-server.id: ${FLUSS_SERVER_ID}" >> $FLUSS_HOME/conf/server.yaml && \
+              echo "bind.listeners: INTERNAL://${POD_IP}:{{ .Values.appConfig.internalPort }}, CLIENT://0.0.0.0:{{ .Values.appConfig.externalPort }}" >> $FLUSS_HOME/conf/server.yaml && \
+              echo "advertised.listeners: CLIENT://${POD_NAME}.tablet-server-hs.${POD_NAMESPACE}.svc.cluster.local:{{ .Values.appConfig.externalPort }}" >> $FLUSS_HOME/conf/server.yaml && \
+              bin/tablet-server.sh start-foreground
+          livenessProbe:
+            failureThreshold: 100
+            timeoutSeconds: 1
+            initialDelaySeconds: 10
+            periodSeconds: 3
+            tcpSocket:
+              port: {{.Values.appConfig.externalPort}}
+          readinessProbe:
+            failureThreshold: 100
+            timeoutSeconds: 1
+            initialDelaySeconds: 10
+            periodSeconds: 3
+            tcpSocket:
+              port: {{.Values.appConfig.externalPort}}
+          resources:
+            {{- toYaml .Values.resources.tabletServer | nindent 12 }}
+          volumeMounts:
+            - name: fluss-conf
+              mountPath: /opt/conf
+            - name: data
+              mountPath: /opt/alldata/fluss
+      volumes:
+        - name: fluss-conf
+          configMap:
+            name: fluss-conf-file
+        {{- if not .Values.persistence.enabled }}
+        - name: data
+          emptyDir: {}
+        {{- end }}
+  {{- if .Values.persistence.enabled }}
+  volumeClaimTemplates:
+    - metadata:
+        name: data
+      spec:
+        accessModes: [ "ReadWriteOnce" ]
+        resources:
+          requests:
+            storage: {{ .Values.persistence.size }}
+        storageClassName: {{ .Values.persistence.storageClass }}
+  {{- end}}
\ No newline at end of file
diff --git a/e2e-iot/high-infra/helm-charts/fluss/templates/svc-coordinator.yaml b/e2e-iot/high-infra/helm-charts/fluss/templates/svc-coordinator.yaml
new file mode 100644
index 0000000..9961ee8
--- /dev/null
+++ b/e2e-iot/high-infra/helm-charts/fluss/templates/svc-coordinator.yaml
@@ -0,0 +1,45 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: coordinator-server-hs
+  labels:
+    {{- include "fluss.labels" . | nindent 4 }}
+    app.kubernetes.io/component: coordinator
+    app: fluss-coordinator
+spec:
+  clusterIP: None
+  type: ClusterIP
+  ports:
+    - name: internal
+      protocol: TCP
+      port: {{ .Values.appConfig.internalPort }}
+      targetPort: {{ .Values.appConfig.internalPort }}
+    - name: client
+      protocol: TCP
+      port: {{ .Values.appConfig.externalPort }}
+      targetPort: {{ .Values.appConfig.externalPort }}
+    - name: metrics
+      protocol: TCP
+      port: 9249
+      targetPort: 9249
+  selector:
+    {{- include "fluss.selectorLabels" . | nindent 4 }}
+    app.kubernetes.io/component: coordinator
\ No newline at end of file
diff --git a/e2e-iot/high-infra/helm-charts/fluss/templates/svc-tablet.yaml b/e2e-iot/high-infra/helm-charts/fluss/templates/svc-tablet.yaml
new file mode 100644
index 0000000..442c30c
--- /dev/null
+++ b/e2e-iot/high-infra/helm-charts/fluss/templates/svc-tablet.yaml
@@ -0,0 +1,45 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: tablet-server-hs
+  labels:
+    {{- include "fluss.labels" . | nindent 4 }}
+    app.kubernetes.io/component: tablet
+    app: fluss-tablet
+spec:
+  clusterIP: None
+  type: ClusterIP
+  ports:
+    - name: internal
+      protocol: TCP
+      port: {{ .Values.appConfig.internalPort }}
+      targetPort: {{ .Values.appConfig.internalPort }}
+    - name: client
+      protocol: TCP
+      port: {{ .Values.appConfig.externalPort }}
+      targetPort: {{ .Values.appConfig.externalPort }}
+    - name: metrics
+      protocol: TCP
+      port: 9249
+      targetPort: 9249
+  selector:
+    {{- include "fluss.selectorLabels" . | nindent 4 }}
+    app.kubernetes.io/component: tablet
\ No newline at end of file
diff --git a/e2e-iot/high-infra/helm-charts/fluss/values.yaml b/e2e-iot/high-infra/helm-charts/fluss/values.yaml
new file mode 100644
index 0000000..09de8c7
--- /dev/null
+++ b/e2e-iot/high-infra/helm-charts/fluss/values.yaml
@@ -0,0 +1,84 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Default values for fluss.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+image:
+  registry: ""
+  repository: fluss
+  tag: "0.8.0-incubating"
+  pullPolicy: IfNotPresent
+  pullSecrets: []
+
+appConfig:
+  internalPort: 9123
+  externalPort: 9124
+
+
+# Fluss server configuration options
+configurationOverrides:
+  default.bucket.number: 3
+  default.replication.factor: 3
+  zookeeper.path.root: /fluss
+  zookeeper.address: zk-zookeeper.{{ .Release.Namespace }}.svc.cluster.local:2181
+  remote.data.dir: /opt/alldata/fluss/remote-data
+  data.dir: /opt/alldata/fluss/data
+  # Log directory: no option is set here. The tablet-server start command replaces
+  # $FLUSS_HOME/log with a symlink to /opt/alldata/fluss/logs so logs land on the data volume.
+  internal.listener.name: INTERNAL
+  # Prometheus metrics configuration
+  metrics.reporters: prometheus
+  metrics.reporter.prometheus.class: org.apache.fluss.metrics.prometheus.PrometheusReporterPlugin
+  metrics.reporter.prometheus.port: "9249"
+
+persistence:
+  # Note: Coordinator never uses persistent storage (always uses emptyDir)
+  # Only tablet servers use persistent volumes when enabled=true
+  enabled: true
+  size: 500Gi
+  storageClass: local-storage
+
+resources:
+  coordinatorServer:
+    requests:
+      cpu: 2000m
+      memory: 4Gi
+    limits:
+      cpu: 4000m
+      memory: 8Gi
+  tabletServer:
+    requests:
+      cpu: 8000m
+      memory: 16Gi
+    limits:
+      cpu: 16000m
+      memory: 32Gi
+
+coordinator:
+  replicas: 1
+  nodeSelector: {}
+  tolerations: []
+  affinity: {}
+
+tabletServer:
+  replicas: 3
+  nodeSelector: {}
+  tolerations: []
+  affinity: {}
\ No newline at end of file
diff --git a/e2e-iot/high-infra/instruction.md b/e2e-iot/high-infra/instruction.md
new file mode 100644
index 0000000..774887e
--- /dev/null
+++ b/e2e-iot/high-infra/instruction.md
@@ -0,0 +1,130 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Deployment Instructions
+
+## ⚠️ IMPORTANT: Read Before Deployment
+
+**Before starting any deployment, you MUST review the following documentation:**
+
+1. **[DEPLOY-STEPS.md](./DEPLOY-STEPS.md)** - Complete step-by-step deployment guide
+2. **[DEPLOYMENT_FIXES.md](./DEPLOYMENT_FIXES.md)** - Known issues and fixes applied
+3. **[k8s/DEPLOYMENT.md](./k8s/DEPLOYMENT.md)** - Detailed Kubernetes deployment guide with NVMe storage verification and multi-instance producer setup
+
+These documents contain critical information about:
+- Prerequisites and setup requirements
+- Step-by-step deployment procedures
+- Common issues and their solutions
+- Configuration requirements
+- Troubleshooting steps
+
+## Quick Start
+
+If you're familiar with the deployment process, you can proceed directly to:
+
+```bash
+cd k8s
+./deploy.sh <namespace> <demo-image-repo> <demo-image-tag> <fluss-image-repo>
+```
+
+However, **it is strongly recommended** to review the documentation files above, especially:
+- If this is your first deployment
+- If you're deploying after infrastructure changes
+- If you encounter any issues during deployment
+- If you're updating configurations
+
+## Documentation Files
+
+### DEPLOY-STEPS.md
+Contains:
+- Prerequisites checklist
+- Complete deployment steps
+- Post-deployment verification
+- Monitoring setup
+- Troubleshooting guide
+
+### DEPLOYMENT_FIXES.md
+Contains:
+- Known issues and their fixes
+- Workarounds for common problems
+- Configuration changes applied
+- Best practices learned from previous deployments
+
+### k8s/DEPLOYMENT.md
+Contains:
+- **NVMe storage verification steps** (critical for tablet servers)
+- **Multi-instance producer deployment** (8 instances, 2 per node)
+- **Flink job deployment with S3 checkpoints**
+- Detailed Kubernetes resource verification
+- Service access instructions (Flink UI, Grafana, Prometheus)
+- Architecture overview
+- Comprehensive troubleshooting guide
+
+## Additional Resources
+- **[MONITORING.md](./MONITORING.md)** - Monitoring setup and configuration
+- **[k8s/jobs/PRODUCER_CONFIG.md](./k8s/jobs/PRODUCER_CONFIG.md)** - Producer optimal configuration and performance tuning
+- **[README.md](./README.md)** - General overview and architecture
+
+## Deployment Checklist
+
+Before deploying, ensure:
+
+- [ ] Reviewed `DEPLOY-STEPS.md` for prerequisites
+- [ ] Reviewed `DEPLOYMENT_FIXES.md` for known issues
+- [ ] Reviewed `k8s/DEPLOYMENT.md` for NVMe storage and multi-instance producer setup
+- [ ] Terraform infrastructure is deployed
+- [ ] ECR images are built and pushed
+- [ ] `kubectl` is configured for the EKS cluster
+- [ ] `helm` is installed
+- [ ] All required environment variables are set
+
+## Critical Post-Deployment Steps
+
+After deploying Fluss components, you **MUST verify** the following (see **[k8s/DEPLOYMENT.md](./k8s/DEPLOYMENT.md)** for detailed steps):
+
+1. **NVMe Storage Verification** (Step 4 in k8s/DEPLOYMENT.md):
+   - Verify tablet server PersistentVolumes are using NVMe paths (`/opt/alldata/fluss/data`)
+   - Check tablet server pods have volumes mounted correctly
+   - Confirm NVMe drives are mounted on tablet server nodes
+   - **Reference:** See `k8s/DEPLOYMENT.md` Step 4 for complete verification commands
+
+2. **Multi-Instance Producer Deployment** (Step 5 in k8s/DEPLOYMENT.md):
+   - Deploy using `k8s/jobs/deploy-producer-multi-instance.sh` with 96 buckets
+   - Verify 8 producer instances are running (2 per node across 4 nodes)
+   - Table must be created with 96 buckets before deploying producers
+   - Check producer metrics and throughput
+   - **Reference:** See `k8s/DEPLOYMENT.md` Step 5 for deployment and verification
+
+3. **Flink Job Deployment** (Step 6 in k8s/DEPLOYMENT.md):
+   - Submit Flink job using `k8s/flink/submit-job-from-image.sh`
+   - Verify S3 checkpoints are configured automatically
+   - Check Flink job is running and processing data
+   - **Reference:** See `k8s/DEPLOYMENT.md` Step 6 for S3 checkpoint verification
+
+## Getting Help
+
+If you encounter issues:
+1. Check `DEPLOYMENT_FIXES.md` for known issues
+2. Review deployment logs in `k8s/deploy-*.log`
+3. Check pod logs: `kubectl logs -n fluss <pod-name>`
+4. Verify configuration matches `DEPLOY-STEPS.md`
+
+---
+
+**Remember**: Always refer to `DEPLOY-STEPS.md`, `DEPLOYMENT_FIXES.md`, and `k8s/DEPLOYMENT.md` before deployment to avoid common pitfalls and ensure a smooth deployment process. The `k8s/DEPLOYMENT.md` file contains critical information about NVMe storage verification and multi-instance producer deployment.
+
diff --git a/e2e-iot/high-infra/k8s/DEPLOYMENT.md b/e2e-iot/high-infra/k8s/DEPLOYMENT.md
new file mode 100644
index 0000000..e31e961
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/DEPLOYMENT.md
@@ -0,0 +1,377 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Complete Deployment Guide
+
+This guide walks through deploying the entire Fluss + Flink stack on AWS EKS.
+
+## Prerequisites
+
+1. **AWS CLI configured** with appropriate credentials
+2. **Terraform** installed (>= 1.0)
+3. **kubectl** installed and configured
+4. **helm** installed (>= 3.0)
+5. **Docker images** built and pushed to ECR:
+   - Fluss image: `fluss:0.8.0-incubating`
+   - Demo image: `fluss-demo:latest` (contains producer and Flink job JAR)
+
+## Step 1: Create EKS Cluster and Node Groups
+
+```bash
+cd aws-deploy-fluss/high-infra/terraform
+
+# Initialize Terraform
+terraform init
+
+# Review the plan
+terraform plan
+
+# Apply infrastructure
+terraform apply
+```
+
+This creates:
+- VPC with public/private subnets
+- EKS cluster
+- Node groups:
+  - Coordinator nodes (1 node)
+  - Tablet server nodes (3 nodes)
+  - Flink JobManager node (1 node)
+  - Flink TaskManager nodes (2 nodes)
+- ECR repositories
+- EBS CSI driver
+
+**Wait for all nodes to join the cluster:**
+```bash
+kubectl get nodes
+# Should show 7 nodes total
+```
+
+## Step 2: Configure kubectl
+
+```bash
+# Get kubeconfig
+aws eks update-kubeconfig --name fluss-eks-cluster --region us-west-2
+
+# Verify access
+kubectl get nodes
+```
+
+## Step 3: Deploy All Kubernetes Resources
+
+```bash
+cd aws-deploy-fluss/high-infra/k8s
+
+# Get ECR repository URLs from Terraform outputs
+cd ../terraform
+DEMO_IMAGE_REPO=$(terraform output -raw ecr_repository_url)
+FLUSS_IMAGE_REPO=$(terraform output -raw ecr_fluss_repository_url)
+
+# Deploy everything
+cd ../k8s
+./deploy.sh fluss "${DEMO_IMAGE_REPO}" latest "${FLUSS_IMAGE_REPO}"
+```
+
+The deployment script will:
+1. Create namespace
+2. Deploy ZooKeeper
+3. Deploy Fluss (via Helm)
+4. Deploy Flink cluster (JobManager + TaskManagers)
+5. Deploy monitoring stack (Prometheus + Grafana)
+6. Deploy ServiceMonitors and PodMonitors
+7. Wait for components to be ready
+
+**Note:** Producer and Flink job deployment are done separately (see steps below).
+
+## Step 4: Verify NVMe Storage for Tablet Servers
+
+**IMPORTANT:** Verify that tablet server storage is using NVMe drives before proceeding.
+
+### Check PersistentVolumes are using NVMe:
+```bash
+# Verify PVs exist and are bound
+kubectl get pv -l component=tablet-server
+
+# Check PV details - should show path: /opt/alldata/fluss/data
+kubectl get pv -l component=tablet-server -o yaml | grep -A 5 "path:"
+
+# Verify PVCs are bound to PVs
+kubectl get pvc -n fluss
+```
+
+### Verify tablet server pods are using NVMe storage:
+```bash
+# Check tablet server pods and their volumes
+kubectl get pods -n fluss -l app=fluss,component=tablet-server -o wide
+
+# Verify mount paths inside tablet server pods
+kubectl exec -n fluss <tablet-server-pod-name> -- df -h | grep alldata
+
+# Check that data directory exists on NVMe
+kubectl exec -n fluss <tablet-server-pod-name> -- ls -la /opt/alldata/fluss/
+```
+
+### Verify NVMe drives are mounted on nodes:
+```bash
+# Get tablet server node names
+TABLET_NODES=$(kubectl get nodes -l fluss-component=tablet-server -o jsonpath='{.items[*].metadata.name}')
+
+# Check NVMe mount on each node (requires node debug access)
+for node in $TABLET_NODES; do
+  echo "Checking node: $node"
+  kubectl debug node/$node -it --image=busybox -- sh -c "df -h | grep alldata || echo 'NVMe not mounted'"
+done
+```
+
+**Expected Results:**
+- PVs should show `path: /opt/alldata/fluss/data`
+- Tablet server pods should have volumes mounted at `/opt/alldata/fluss`
+- Nodes should show NVMe drives mounted at `/opt/alldata`
+- Data directory should exist: `/opt/alldata/fluss/data`
+
+## Step 5: Deploy Multi-Instance Producer
+
+Deploy 8 producer instances (2 per node across 4 producer nodes) with 128 buckets:
+
+```bash
+cd aws-deploy-fluss/high-infra/k8s/jobs
+
+# Deploy multi-instance producer (8 instances total, 2 per node, 128 buckets)
+export BUCKETS=128
+./deploy-producer-multi-instance.sh
+
+# Or with custom parameters:
+export PRODUCER_RATE=250000
+export TOTAL_PRODUCERS=8
+export BUCKETS=128
+./deploy-producer-multi-instance.sh
+```
+
+This will:
+- Deploy 8 producer jobs (instance IDs 0-7)
+- Ensure 2 pods per producer node using topology spread constraints
+- Each producer runs at the specified rate (default: 250K records/sec per instance)
+- Uses 128 buckets for the Fluss table (must match table bucket count)
+
+**Verify producer deployment:**
+```bash
+# Check producer pods (should see 8 pods, 2 per node)
+kubectl get pods -n fluss -l app=fluss-producer -o wide
+
+# Check producer metrics
+kubectl logs -n fluss -l app=fluss-producer --tail=50
+```
+
+## Step 6: Deploy Flink Job
+
+Submit the Flink aggregator job:
+
+```bash
+cd aws-deploy-fluss/high-infra/k8s/flink
+
+# Submit Flink job (automatically configures S3 checkpoints)
+./submit-job-from-image.sh
+```
+
+This script will:
+1. Update Flink ConfigMap with S3 checkpoint paths (from Terraform outputs)
+2. Restart Flink pods to apply new configuration
+3. Submit the Flink aggregator job
+
+**Note:** S3 checkpoint configuration is automatically handled by the script.
+
+## Step 7: Verify Deployment
+
+### Check all pods are running:
+```bash
+kubectl get pods -n fluss
+kubectl get pods -n monitoring
+```
+
+### Verify Flink cluster:
+```bash
+# Check Flink pods
+kubectl get pods -n fluss -l app=flink
+
+# Verify node placement
+kubectl get pods -n fluss -l app=flink -o wide
+kubectl get nodes -l flink-component --show-labels
+```
+
+### Verify Flink S3 Checkpoints:
+```bash
+# Get S3 bucket name from Terraform
+cd aws-deploy-fluss/high-infra/terraform
+S3_BUCKET=$(terraform output -raw flink_s3_bucket_name)
+
+# Check checkpoints are being written to S3
+aws s3 ls s3://${S3_BUCKET}/flink-checkpoints/fluss-eks-cluster/ --recursive
+
+# Verify checkpoint configuration in Flink ConfigMap
+kubectl get configmap flink-config -n fluss -o yaml | grep -A 2 "state.checkpoints.dir"
+```
+
+### Verify monitoring:
+```bash
+# Check ServiceMonitors
+kubectl get servicemonitor -n fluss
+
+# Check PodMonitors
+kubectl get podmonitor -n fluss
+
+# Check Prometheus targets (after port-forwarding)
+kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090
+# Open http://localhost:9090/targets
+```
+
+## Step 8: Access Services
+
+### Flink Web UI
+```bash
+kubectl port-forward -n fluss svc/flink-jobmanager 8081:8081
+# Open http://localhost:8081
+```
+
+### Grafana
+```bash
+GRAFANA_SVC=$(kubectl get svc -n monitoring -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}')
+kubectl port-forward -n monitoring svc/$GRAFANA_SVC 3000:80
+# Open http://localhost:3000
+# Username: admin
+# Password: admin123
+```
+
+### Prometheus
+```bash
+PROM_SVC=$(kubectl get svc -n monitoring -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].metadata.name}')
+kubectl port-forward -n monitoring svc/$PROM_SVC 9090:9090
+# Open http://localhost:9090
+```
+
+## Architecture Overview
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                      EKS Cluster                            │
+├─────────────────────────────────────────────────────────────┤
+│                                                             │
+│  ┌──────────────┐  ┌──────────────┐  ┌──────────────┐     │
+│  │ Coordinator │  │ Tablet Svr 1 │  │ Tablet Svr 2 │     │
+│  │   (1 node)  │  │   (1 node)   │  │   (1 node)   │     │
+│  └──────────────┘  └──────────────┘  └──────────────┘     │
+│                                                             │
+│  ┌──────────────┐  ┌──────────────┐  ┌──────────────┐     │
+│  │ JobManager  │  │ TaskManager │  │ TaskManager │     │
+│  │   (1 node)  │  │   (1 node)  │  │   (1 node)  │     │
+│  └──────────────┘  └──────────────┘  └──────────────┘     │
+│                                                             │
+│  ┌──────────────┐  ┌──────────────┐                       │
+│  │  Producer    │  │  Monitoring │                       │
+│  │ (8 instances │  │  (Prom/Graf)│                       │
+│  │  2 per node) │  │             │                       │
+│  └──────────────┘  └──────────────┘                       │
+└─────────────────────────────────────────────────────────────┘
+```
+
+## Troubleshooting
+
+### Pods not starting
+```bash
+# Check pod events
+kubectl describe pod <pod-name> -n fluss
+
+# Check logs
+kubectl logs <pod-name> -n fluss
+```
+
+### Flink job not submitted
+```bash
+# Check job submission logs
+kubectl logs -n fluss -l app=flink-job-submission
+
+# Check if Flink JobManager is ready
+kubectl get pods -n fluss -l component=jobmanager
+kubectl logs -n fluss -l component=jobmanager
+```
+
+### Metrics not appearing
+```bash
+# Check if ServiceMonitors are created
+kubectl get servicemonitor -n fluss
+
+# Check Prometheus targets
+# Port-forward Prometheus and check /targets endpoint
+
+# Verify metrics endpoints
+kubectl port-forward -n fluss <pod-name> 8080:8080
+curl http://localhost:8080/metrics
+```
+
+### Node placement issues
+```bash
+# Check node labels
+kubectl get nodes --show-labels
+
+# Check pod node placement
+kubectl get pods -n fluss -o wide
+
+# Check pod events for scheduling issues
+kubectl describe pod <pod-name> -n fluss | grep -A 10 Events
+```
+
+### Tablet server storage not using NVMe
+```bash
+# Check PV bindings
+kubectl get pv -l component=tablet-server
+kubectl get pvc -n fluss
+
+# Verify PV paths point to NVMe mount
+kubectl get pv fluss-tablet-data-0 -o jsonpath='{.spec.local.path}'
+# Should show: /opt/alldata/fluss/data
+
+# Check tablet server pod volumes
+kubectl describe pod <tablet-server-pod> -n fluss | grep -A 10 "Mounts:"
+
+# Verify NVMe is mounted on node
+kubectl debug node/<tablet-node> -it --image=busybox -- df -h | grep alldata
+```
+
+### Producer pods not distributing correctly
+```bash
+# Check topology spread constraints
+kubectl get pods -n fluss -l app=fluss-producer -o wide
+
+# Verify 2 pods per producer node
+kubectl get pods -n fluss -l app=fluss-producer -o wide --no-headers | awk '{print $7}' | sort | uniq -c
+
+# Check producer job configuration
+kubectl get job -n fluss -l app=fluss-producer -o yaml | grep -A 5 topologySpreadConstraints
+```
+
+## Cleanup
+
+To destroy everything:
+```bash
+# Delete Kubernetes resources
+kubectl delete namespace fluss monitoring
+
+# Destroy Terraform infrastructure
+cd aws-deploy-fluss/high-infra/terraform
+terraform destroy
+```
+
diff --git a/e2e-iot/high-infra/k8s/README.md b/e2e-iot/high-infra/k8s/README.md
new file mode 100644
index 0000000..f8a8da5
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/README.md
@@ -0,0 +1,222 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Kubernetes Deployment for Fluss + Flink
+
+This directory contains Kubernetes YAML manifests for deploying Fluss, Flink, and related components.
+
+## Structure
+
+```
+k8s/
+├── namespace/          # Namespace definition
+├── zookeeper/         # ZooKeeper StatefulSet and Service
+├── flink/             # Flink cluster (JobManager + TaskManagers)
+├── jobs/              # Producer and Flink aggregator jobs
+├── monitoring/        # Monitoring resources (Prometheus/Grafana)
+└── deploy.sh          # Deployment script
+```
+
+## Prerequisites
+
+1. EKS cluster created via Terraform (see `../terraform/`)
+2. `kubectl` configured to access the cluster
+3. `helm` installed
+4. Docker images pushed to ECR (or accessible registry)
+
+## Deployment
+
+### Quick Deploy
+
+```bash
+cd aws-deploy-fluss/high-infra/k8s
+./deploy.sh fluss <demo-image-repo> <demo-image-tag> <fluss-image-repo>
+```
+
+Example:
+```bash
+./deploy.sh fluss \
+  123456789012.dkr.ecr.us-west-2.amazonaws.com/fluss-demo \
+  latest \
+  123456789012.dkr.ecr.us-west-2.amazonaws.com/fluss:0.8.0-incubating
+```
+
+### Manual Deploy
+
+1. **Create namespace:**
+   ```bash
+   kubectl apply -f namespace/namespace.yaml
+   ```
+
+2. **Deploy ZooKeeper:**
+   ```bash
+   kubectl apply -f zookeeper/zookeeper.yaml
+   kubectl wait --for=condition=ready pod -l app=zookeeper -n fluss --timeout=120s
+   ```
+
+3. **Deploy Fluss via Helm:**
+   ```bash
+   helm upgrade --install fluss ../helm-charts/fluss \
+     --namespace fluss \
+     --set image.repository="<fluss-image-repo>" \
+     --set image.tag="<fluss-image-tag>" \
+     --set configurationOverrides."zookeeper\.address"="zk-svc.fluss.svc.cluster.local:2181"
+   ```
+
+4. **Deploy Flink cluster:**
+   ```bash
+   kubectl apply -f flink/flink-config.yaml
+   kubectl apply -f flink/flink-jobmanager.yaml
+   kubectl apply -f flink/flink-taskmanager.yaml
+   ```
+
+5. **Deploy monitoring:**
+   ```bash
+   kubectl create namespace monitoring
+   helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+   helm repo update
+   helm upgrade --install prometheus prometheus-community/kube-prometheus-stack \
+     --version 55.5.0 \
+     --namespace monitoring \
+     --set grafana.enabled=true \
+     --set grafana.adminUser=admin \
+     --set grafana.adminPassword=admin123 \
+     --set grafana.service.type=LoadBalancer
+   ```
+
+6. **Deploy jobs (with image substitution):**
+   ```bash
+   export DEMO_IMAGE_REPO="<demo-image-repo>"
+   export DEMO_IMAGE_TAG="<demo-image-tag>"
+   # Deploy producer job (standalone)
+   envsubst < jobs/producer-job.yaml | kubectl apply -f -
+   # Submit Flink aggregator job to Flink cluster
+   envsubst < flink/flink-job-submission-simple.yaml | kubectl apply -f -
+   ```
+
+## Flink Cluster
+
+The Flink cluster consists of:
+- **JobManager**: 1 replica (Deployment)
+- **TaskManagers**: 2 replicas (StatefulSet)
+- **Image**: `apache/flink:1.20.3-scala_2.12-java17`
+
+### Access Flink Web UI
+
+```bash
+kubectl port-forward -n fluss svc/flink-jobmanager 8081:8081
+```
+
+Then open http://localhost:8081
+
+### Submit Flink Job
+
+The Flink aggregator job is automatically submitted to the Flink cluster via the `flink-job-submission-simple.yaml` job.
+
+To manually submit a job:
+
+```bash
+# Method 1: Use the job submission job (recommended)
+export DEMO_IMAGE_REPO="<demo-image-repo>"
+export DEMO_IMAGE_TAG="<demo-image-tag>"
+envsubst < flink/flink-job-submission-simple.yaml | kubectl apply -f -
+
+# Method 2: Use Flink CLI directly
+FLINK_JM_POD=$(kubectl get pod -n fluss -l component=jobmanager -o jsonpath='{.items[0].metadata.name}')
+
+# Copy JAR to pod
+kubectl cp /path/to/fluss-flink-realtime-demo.jar $FLINK_JM_POD:/tmp/job.jar -n fluss
+
+# Submit job via Flink CLI
+kubectl exec -n fluss $FLINK_JM_POD -- \
+  /opt/flink/bin/flink run \
+  -m flink-jobmanager:6123 \
+  -c org.apache.fluss.benchmarks.flink.FlinkSensorAggregatorJob \
+  /tmp/job.jar \
+  --bootstrap coordinator-server-hs.fluss.svc.cluster.local:9124 \
+  --database iot \
+  --table sensor_readings \
+  --window-minutes 1
+
+# Method 3: Use Flink REST API
+JOBMANAGER="flink-jobmanager.fluss.svc.cluster.local:8081"
+
+# Upload JAR
+JAR_ID=$(curl -s -X POST \
+  "http://${JOBMANAGER}/v1/jars/upload" \
+  -H "Content-Type: multipart/form-data" \
+  -F "jarfile=@/path/to/fluss-flink-realtime-demo.jar" \
+  | jq -r '.filename' | sed 's|.*/||')
+
+# Submit job
+curl -X POST \
+  "http://${JOBMANAGER}/v1/jars/${JAR_ID}/run" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "entryClass": "org.apache.fluss.benchmarks.flink.FlinkSensorAggregatorJob",
+    "programArgs": "--bootstrap coordinator-server-hs.fluss.svc.cluster.local:9124 --database iot --table sensor_readings --window-minutes 1",
+    "parallelism": 2
+  }'
+```
+
+## Monitoring
+
+### Access Grafana
+
+```bash
+# Get Grafana LoadBalancer URL
+kubectl get svc -n monitoring | grep grafana
+
+# Or port-forward
+kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80
+```
+
+Default credentials:
+- Username: `admin`
+- Password: `admin123`
+
+### Prometheus Metrics
+
+All components expose Prometheus metrics:
+- **Producer**: Port 8080, path `/metrics`
+- **Flink Aggregator**: Port 9249, path `/metrics`
+- **Flink JobManager**: Port 9249, path `/metrics`
+- **Flink TaskManagers**: Port 9249, path `/metrics`
+- **Fluss Servers**: Port 9249, path `/metrics`
+
+## Troubleshooting
+
+### Check pod status:
+```bash
+kubectl get pods -n fluss
+kubectl get pods -n monitoring
+```
+
+### View logs:
+```bash
+kubectl logs -n fluss <pod-name>
+kubectl logs -n fluss -l app=fluss-producer --tail=50 -f
+kubectl logs -n fluss -l app=flink-aggregator --tail=50 -f
+```
+
+### Check services:
+```bash
+kubectl get svc -n fluss
+kubectl get svc -n monitoring
+```
+
diff --git a/e2e-iot/high-infra/k8s/deploy.sh b/e2e-iot/high-infra/k8s/deploy.sh
new file mode 100755
index 0000000..8bfadd7
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/deploy.sh
@@ -0,0 +1,296 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail
+
+# Deployment script for Kubernetes resources
+# Usage: ./deploy.sh [namespace] [demo-image-repo] [demo-image-tag] [fluss-image-repo]
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+K8S_DIR="${SCRIPT_DIR}"
+
+NAMESPACE="${1:-fluss}"
+DEMO_IMAGE_REPO="${2:-}"
+DEMO_IMAGE_TAG="${3:-latest}"
+FLUSS_IMAGE_REPO="${4:-apache/fluss:0.8.0-incubating}"
+
+# Export variables for envsubst
+export NAMESPACE
+export DEMO_IMAGE_REPO
+export DEMO_IMAGE_TAG
+
+echo "=== Deploying Kubernetes Resources ==="
+echo "Namespace: ${NAMESPACE}"
+echo "Demo Image: ${DEMO_IMAGE_REPO}:${DEMO_IMAGE_TAG}"
+echo "Fluss Image: ${FLUSS_IMAGE_REPO}"
+echo ""
+
+# Check kubectl is available
+if ! command -v kubectl &> /dev/null; then
+    echo "ERROR: kubectl is not installed or not in PATH"
+    exit 1
+fi
+
+# Check helm is available (for Fluss and monitoring)
+if ! command -v helm &> /dev/null; then
+    echo "ERROR: helm is not installed or not in PATH"
+    exit 1
+fi
+
+# 1. Create namespace
+echo "[1/8] Creating namespace..."
+kubectl apply -f "${K8S_DIR}/namespace/namespace.yaml"
+
+# 2. Deploy ZooKeeper
+echo "[2/8] Deploying ZooKeeper..."
+kubectl apply -f "${K8S_DIR}/zookeeper/zookeeper.yaml"
+
+# Wait for ZooKeeper to be ready
+echo "Waiting for ZooKeeper to be ready..."
+kubectl wait --for=condition=ready pod -l app=zookeeper -n ${NAMESPACE} --timeout=120s || true
+
+# 3. Deploy Fluss via Helm
+echo "[3/8] Deploying Fluss via Helm..."
+if [ -n "${FLUSS_IMAGE_REPO}" ]; then
+    # Split the image reference into Helm values. Accepted forms:
+    #   ECR:        <account>.dkr.ecr.<region>.amazonaws.com/<repo>[:<tag>]
+    #   Docker Hub: <repo>[:<tag>]
+    #   Other:      <registry-host>[:<port>]/<repo>[:<tag>]
+    # A tag can only appear in the last path component, so only that component
+    # is inspected and only the final ":<tag>" is stripped. This keeps a
+    # registry port (e.g. localhost:5000/fluss) from being mistaken for a tag.
+    if [[ "${FLUSS_IMAGE_REPO##*/}" == *":"* ]]; then
+        FLUSS_REPO="${FLUSS_IMAGE_REPO%:*}"
+        FLUSS_TAG="${FLUSS_IMAGE_REPO##*:}"
+    else
+        FLUSS_REPO="${FLUSS_IMAGE_REPO}"
+        FLUSS_TAG="0.8.0-incubating"
+    fi
+
+    # If the reference already names a registry host (ECR, or a first path
+    # component that contains "." or ":" or is "localhost"), keep image.registry
+    # empty and pass the full path as image.repository, exactly as the ECR case
+    # does. Otherwise default the registry to Docker Hub.
+    FLUSS_REGISTRY_HOST="${FLUSS_REPO%%/*}"
+    if [[ "${FLUSS_IMAGE_REPO}" == *".dkr.ecr."* ]] ||
+       [[ "${FLUSS_REPO}" == */* && ( "${FLUSS_REGISTRY_HOST}" == *[.:]* || "${FLUSS_REGISTRY_HOST}" == "localhost" ) ]]; then
+        FLUSS_REGISTRY=""
+    else
+        FLUSS_REGISTRY="docker.io"
+    fi
+
+    helm upgrade --install fluss "${SCRIPT_DIR}/../helm-charts/fluss" \
+        --namespace "${NAMESPACE}" \
+        --set image.registry="${FLUSS_REGISTRY}" \
+        --set image.repository="${FLUSS_REPO}" \
+        --set image.tag="${FLUSS_TAG}" \
+        --set persistence.enabled=true \
+        --set persistence.storageClass=local-storage \
+        --set persistence.size=500Gi \
+        --set configurationOverrides."zookeeper\.address"="zk-svc.${NAMESPACE}.svc.cluster.local:2181" \
+        --wait=false
+else
+    helm upgrade --install fluss "${SCRIPT_DIR}/../helm-charts/fluss" \
+        --namespace "${NAMESPACE}" \
+        --set persistence.enabled=true \
+        --set persistence.storageClass=local-storage \
+        --set persistence.size=500Gi \
+        --set configurationOverrides."zookeeper\.address"="zk-svc.${NAMESPACE}.svc.cluster.local:2181" \
+        --wait=false
+fi
+
+# 4. Deploy Flink cluster
+echo "[4/8] Deploying Flink cluster..."
+# Flink image is hardcoded to apache/flink:1.20.3-scala_2.12-java17.
+# Every manifest is rendered with envsubst (namespace, plus DEMO_IMAGE_REPO/DEMO_IMAGE_TAG
+# for the init container) and applied in order: service account first, then the
+# ConfigMap, then JobManager and TaskManager.
+for flink_manifest in \
+    flink-serviceaccount.yaml flink-config.yaml \
+    flink-jobmanager.yaml flink-taskmanager.yaml; do
+    envsubst < "${K8S_DIR}/flink/${flink_manifest}" | kubectl apply -f -
+done
+
+# 4.1. Update Flink ConfigMap with S3 checkpoint configuration
+# Best-effort step: every failure path below prints a warning and the deployment continues.
+echo "[4.1/8] Updating Flink ConfigMap with S3 checkpoint configuration..."
+TERRAFORM_DIR="${SCRIPT_DIR}/../terraform"
+CLUSTER_NAME="fluss-eks-cluster"
+
+if command -v terraform &> /dev/null && [ -d "${TERRAFORM_DIR}" ]; then
+    # Subshell keeps the script's working directory unchanged; a failed lookup yields an empty name.
+    S3_BUCKET=$(cd "${TERRAFORM_DIR}" && terraform output -raw flink_s3_bucket_name 2>/dev/null || echo "")
+
+    if [ -n "$S3_BUCKET" ]; then
+        echo "  S3 Bucket: $S3_BUCKET"
+
+        # Get current ConfigMap
+        CURRENT_CONFIG=$(kubectl get configmap flink-config -n "${NAMESPACE}" -o jsonpath='{.data.flink-conf\.yaml}' 2>/dev/null || echo "")
+
+        if [ -n "$CURRENT_CONFIG" ]; then
+            # Replace placeholder with actual bucket name; s3a:// placeholder URIs are rewritten to s3:// too
+            UPDATED_CONFIG=$(echo "$CURRENT_CONFIG" | sed \
+                -e "s|s3://fluss-eks-cluster-flink-state-PLACEHOLDER/flink-checkpoints/fluss-eks-cluster/|s3://${S3_BUCKET}/flink-checkpoints/${CLUSTER_NAME}/|g" \
+                -e "s|s3://fluss-eks-cluster-flink-state-PLACEHOLDER/flink-savepoints/fluss-eks-cluster/|s3://${S3_BUCKET}/flink-savepoints/${CLUSTER_NAME}/|g" \
+                -e "s|s3a://fluss-eks-cluster-flink-state-PLACEHOLDER/flink-checkpoints/fluss-eks-cluster/|s3://${S3_BUCKET}/flink-checkpoints/${CLUSTER_NAME}/|g" \
+                -e "s|s3a://fluss-eks-cluster-flink-state-PLACEHOLDER/flink-savepoints/fluss-eks-cluster/|s3://${S3_BUCKET}/flink-savepoints/${CLUSTER_NAME}/|g")
+
+            # Update ConfigMap (jq JSON-encodes the multi-line config for the merge patch)
+            if ! command -v jq &> /dev/null; then
+                echo "  ⚠ jq not found, skipping S3 ConfigMap update (will use placeholder)"
+            elif kubectl patch configmap flink-config -n "${NAMESPACE}" --type merge \
+                    -p "{\"data\":{\"flink-conf.yaml\":$(echo "$UPDATED_CONFIG" | jq -Rs .)}}"; then
+                echo "  ✓ ConfigMap updated with S3 checkpoint paths"
+            else
+                echo "  ⚠ Failed to patch flink-config ConfigMap, S3 checkpoint paths not updated"
+            fi
+        else
+            echo "  ⚠ ConfigMap not found yet, will be updated when Flink pods are ready"
+        fi
+    else
+        echo "  ⚠ S3 bucket not found in Terraform outputs, skipping S3 configuration"
+    fi
+else
+    echo "  ⚠ Terraform not found or directory missing, skipping S3 configuration update"
+fi
+
+# 5. Deploy monitoring (Prometheus/Grafana)
+# Grafana admin credentials come from GRAFANA_USER / GRAFANA_PASS (defaults: admin /
+# admin123) so they match what the dashboard import step uses to log in.
+echo "[5/8] Deploying monitoring stack..."
+kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f -
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo update
+helm upgrade --install prometheus prometheus-community/kube-prometheus-stack \
+    --version 55.5.0 \
+    --namespace monitoring \
+    --set prometheus.prometheusSpec.retention=30d \
+    --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
+    --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \
+    --set grafana.enabled=true --set grafana.service.type=LoadBalancer \
+    --set-string grafana.adminUser="${GRAFANA_USER:-admin}" \
+    --set-string grafana.adminPassword="${GRAFANA_PASS:-admin123}" \
+    --set alertmanager.enabled=false --wait=false
+
+# 6. Deploy ServiceMonitors and PodMonitors for Prometheus
+echo "[6/8] Deploying ServiceMonitors and PodMonitors for Prometheus..."
+if [ -f "${K8S_DIR}/monitoring/servicemonitors.yaml" ]; then
+kubectl apply -f "${K8S_DIR}/monitoring/servicemonitors.yaml"
+    echo "  ✓ ServiceMonitors deployed"
+else
+    echo "  WARNING: servicemonitors.yaml not found, skipping..."
+fi
+if [ -f "${K8S_DIR}/monitoring/podmonitors.yaml" ]; then
+kubectl apply -f "${K8S_DIR}/monitoring/podmonitors.yaml"
+    echo "  ✓ PodMonitors deployed"
+else
+    echo "  WARNING: podmonitors.yaml not found, skipping..."
+fi
+
+# 7. Deploy Grafana dashboard (if exists)
+echo "[7/8] Deploying Grafana dashboard..."
+if [ -f "${K8S_DIR}/monitoring/grafana-dashboard.yaml" ]; then
+    kubectl apply -f "${K8S_DIR}/monitoring/grafana-dashboard.yaml"
+    echo "  ✓ Grafana dashboard ConfigMap deployed"
+
+    # Import dashboard via Grafana API to ensure it's visible (best effort, never fatal)
+    echo "  Importing dashboard via Grafana API..."
+    GRAFANA_USER="${GRAFANA_USER:-admin}"
+    GRAFANA_PASS="${GRAFANA_PASS:-admin123}"
+    GRAFANA_POD=$(kubectl get pod -n monitoring -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+
+    if [ -n "${GRAFANA_POD}" ]; then
+        # Helm ran with --wait=false, so block (best effort) until the pod reports Ready
+        kubectl wait --for=condition=ready "pod/${GRAFANA_POD}" -n monitoring --timeout=120s >/dev/null 2>&1 || true
+
+        # Extract dashboard JSON from ConfigMap
+        DASHBOARD_JSON_CONTENT=$(kubectl get configmap -n monitoring fluss-flink-dashboard -o jsonpath='{.data.fluss-flink-dashboard\.json}' 2>/dev/null || echo "")
+
+        if [ -n "${DASHBOARD_JSON_CONTENT}" ]; then
+            # The API expects {"dashboard": {...}, "overwrite": true}; wrap raw dashboard JSON that is not wrapped yet
+            if command -v jq &> /dev/null; then
+                DASHBOARD_PAYLOAD=$(echo "${DASHBOARD_JSON_CONTENT}" | jq 'if has("dashboard") then . + {overwrite: true} else {dashboard: ., overwrite: true} end' 2>/dev/null || echo "${DASHBOARD_JSON_CONTENT}")
+            else
+                DASHBOARD_PAYLOAD="${DASHBOARD_JSON_CONTENT}"
+            fi
+
+            # Import via Grafana API (curl runs inside the Grafana container, so localhost:3000 is Grafana itself)
+            IMPORT_RESPONSE=$(kubectl exec -n monitoring "${GRAFANA_POD}" -c grafana -- curl -s -X POST \
+                "http://localhost:3000/api/dashboards/db" \
+                -H "Content-Type: application/json" \
+                -u "${GRAFANA_USER}:${GRAFANA_PASS}" \
+                -d "${DASHBOARD_PAYLOAD}" 2>/dev/null || echo "")
+
+            if echo "${IMPORT_RESPONSE}" | grep -q '"status":"success"'; then
+                echo "  ✓ Dashboard imported successfully via Grafana API!"
+            else
+                echo "  ⚠ Dashboard import via API failed (may need manual import)"
+                echo "  Dashboard ConfigMap is deployed, Grafana should auto-discover it"
+            fi
+        else
+            echo "  ⚠ Could not extract dashboard JSON from ConfigMap"
+        fi
+    else
+        echo "  ⚠ Grafana pod not found, skipping API import"
+        echo "  Dashboard ConfigMap is deployed, Grafana should auto-discover it"
+    fi
+else
+    echo "  No Grafana dashboard YAML found, skipping..."
+fi
+
+# 8. Wait for components to be ready
+echo "[8/8] Waiting for components to be ready..."
+echo "  Waiting for Flink JobManager..."
+kubectl wait --for=condition=ready pod -l app=flink,component=jobmanager -n ${NAMESPACE} --timeout=300s || true
+echo "  Waiting for Flink TaskManagers..."
+kubectl wait --for=condition=ready pod -l app=flink,component=taskmanager -n ${NAMESPACE} --timeout=300s || true
+
+echo ""
+echo "=== Deployment Complete ==="
+echo ""
+echo "Check status:"
+echo "  kubectl get pods -n ${NAMESPACE}"
+echo "  kubectl get pods -n monitoring"
+echo ""
+echo "Check Flink cluster:"
+echo "  kubectl get pods -n ${NAMESPACE} -l app=flink"
+echo "  kubectl get nodes -l flink-component"
+echo ""
+echo "Check monitoring:"
+echo "  kubectl get servicemonitor -n ${NAMESPACE}"
+echo "  kubectl get podmonitor -n ${NAMESPACE}"
+echo ""
+echo "Access Flink Web UI:"
+echo "  kubectl port-forward -n ${NAMESPACE} svc/flink-jobmanager 8081:8081"
+echo "  Then open: http://localhost:8081"
+echo ""
+echo "Access Grafana:"
+echo "  GRAFANA_SVC=\$(kubectl get svc -n monitoring -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}')"
+echo "  kubectl port-forward -n monitoring svc/\$GRAFANA_SVC 3000:80"
+echo "  Then open: http://localhost:3000"
+echo "  Username: admin"
+echo "  Password: admin123"
+echo ""
+echo "Access Prometheus:"
+echo "  PROM_SVC=\$(kubectl get svc -n monitoring -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].metadata.name}')"
+echo "  kubectl port-forward -n monitoring svc/\$PROM_SVC 9090:9090"
+echo "  Then open: http://localhost:9090"
+echo ""
+echo "Submit Flink aggregator job manually:"
+echo "  cd ${K8S_DIR}/flink && ./submit-job-local.sh"
+
diff --git a/e2e-iot/high-infra/k8s/flink/Dockerfile b/e2e-iot/high-infra/k8s/flink/Dockerfile
new file mode 100644
index 0000000..73261d5
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/Dockerfile
@@ -0,0 +1,77 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Dockerfile for Fluss Flink Demo Job
+# Builds the JAR and includes it in Flink image for local:// submission
+
+# Stage 1: Build the JAR using Maven
+FROM --platform=linux/amd64 maven:3.9-eclipse-temurin-17 AS builder
+
+# Demo project path relative to the build context root (expected context: three levels above this file).
+# COPY cannot read outside the build context, so "../" source paths do not work.
+ARG DEMO_SRC_DIR=demos/demo/fluss_flink_realtime_demo
+
+WORKDIR /build
+
+# Copy the POM first so the dependency layer below stays cached across source changes
+COPY ${DEMO_SRC_DIR}/pom.xml ./pom.xml
+
+# Download dependencies (cached layer); best effort, the real build below is authoritative
+RUN mvn -B dependency:go-offline -f pom.xml || true
+
+# Copy the whole project (sources and any nested module directories)
+COPY ${DEMO_SRC_DIR}/ ./
+
+# Build the JAR with all dependencies; a failed build fails the image build
+RUN mvn -B clean package -DskipTests -f pom.xml
+
+# Stage the built JAR at a fixed path, because COPY --from cannot evaluate shell expressions
+RUN jar_path="$(find . -path '*/target/*' -name 'fluss-flink-realtime-demo*.jar' -not -name '*-sources.jar' -not -name '*-javadoc.jar' | head -n 1)" && \
+    if [ -z "${jar_path}" ]; then echo "ERROR: no fluss-flink-realtime-demo JAR produced by the Maven build" >&2; exit 1; fi && \
+    cp "${jar_path}" /build/fluss-flink-realtime-demo.jar
+
+# Stage 2: Create runtime image with Flink
+FROM --platform=linux/amd64 apache/flink:1.20.3-scala_2.12-java17
+
+# Ensure Prometheus metrics reporter is in plugins directory (required for Flink 1.20.3)
+# The base image should already have it; curl -f makes a failed download fail the build instead of saving an error page
+RUN mkdir -p /opt/flink/plugins/metrics-prometheus && \
+    if [ ! -f /opt/flink/plugins/metrics-prometheus/flink-metrics-prometheus-1.20.3.jar ]; then \
+        curl -fSL https://repo1.maven.org/maven2/org/apache/flink/flink-metrics-prometheus/1.20.3/flink-metrics-prometheus-1.20.3.jar \
+            -o /opt/flink/plugins/metrics-prometheus/flink-metrics-prometheus-1.20.3.jar && \
+        chmod 644 /opt/flink/plugins/metrics-prometheus/flink-metrics-prometheus-1.20.3.jar; \
+    fi
+
+# Set working directory
+WORKDIR /opt/flink/usrlib
+
+# Copy the built JAR from builder stage
+COPY --from=builder /build/fluss-flink-realtime-demo.jar /opt/flink/usrlib/fluss-flink-realtime-demo.jar
+
+# Verify the JAR is present
+RUN ls -lh /opt/flink/usrlib/fluss-flink-realtime-demo.jar || \
+    (echo "ERROR: JAR file not found" && exit 1)
+
+# Metadata
+LABEL maintainer="Fluss Team"
+LABEL description="Fluss Flink Realtime Demo - JAR embedded for local:// submission"
+LABEL version="0.1.0"
+LABEL flink.version="1.20.3"
+
+# The Flink cluster will use this image, and jobs can reference the JAR via local://
+# No ENTRYPOINT needed - Flink deployments handle the lifecycle
+
diff --git a/e2e-iot/high-infra/k8s/flink/Dockerfile.simple b/e2e-iot/high-infra/k8s/flink/Dockerfile.simple
new file mode 100644
index 0000000..bc8e7eb
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/Dockerfile.simple
@@ -0,0 +1,30 @@
+# Simplified Dockerfile - assumes JAR is already built and placed in the build context
+# Build with: cp <built-jar> ./fluss-flink-realtime-demo.jar && docker build -f Dockerfile.simple .
+
+FROM apache/flink:1.20.3-scala_2.12-java17
+
+# Ensure Prometheus metrics reporter is in plugins directory (required for Flink 1.20.3)
+# The base image should already have it; curl -f makes a failed download fail the build instead of saving an error page
+RUN mkdir -p /opt/flink/plugins/metrics-prometheus && \
+    if [ ! -f /opt/flink/plugins/metrics-prometheus/flink-metrics-prometheus-1.20.3.jar ]; then \
+        curl -fSL https://repo1.maven.org/maven2/org/apache/flink/flink-metrics-prometheus/1.20.3/flink-metrics-prometheus-1.20.3.jar \
+            -o /opt/flink/plugins/metrics-prometheus/flink-metrics-prometheus-1.20.3.jar && \
+        chmod 644 /opt/flink/plugins/metrics-prometheus/flink-metrics-prometheus-1.20.3.jar; \
+    fi
+
+# Set working directory
+WORKDIR /opt/flink/usrlib
+
+# Copy JAR from build context (the file name is fixed; there is no build argument for it)
+COPY fluss-flink-realtime-demo.jar /opt/flink/usrlib/fluss-flink-realtime-demo.jar
+
+# Verify the JAR is present
+RUN ls -lh /opt/flink/usrlib/fluss-flink-realtime-demo.jar || \
+    (echo "ERROR: JAR file not found" && exit 1)
+
+# Metadata
+LABEL maintainer="Fluss Team"
+LABEL description="Fluss Flink Realtime Demo - JAR embedded for local:// submission"
+LABEL version="0.1.0"
+LABEL flink.version="1.20.3"
+
diff --git a/e2e-iot/high-infra/k8s/flink/README-DOCKER-IMAGE.md b/e2e-iot/high-infra/k8s/flink/README-DOCKER-IMAGE.md
new file mode 100644
index 0000000..4afd40e
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/README-DOCKER-IMAGE.md
@@ -0,0 +1,81 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Flink Job with Embedded JAR
+
+This approach embeds the Flink job JAR directly in the Docker image, so the job can be submitted with a `local://` path instead of uploading the JAR to the cluster at runtime.
+
+## Workflow
+
+1. **Build and Push Image**: The JAR is built and embedded in a Flink Docker image, then pushed to ECR
+2. **Deploy Flink Cluster**: Flink JobManager and TaskManager use the custom image (which contains the JAR)
+3. **Submit Job**: Use Flink CLI with `local://` path to reference the JAR in the image
+
+## Files
+
+- `Dockerfile.simple` - Dockerfile that embeds JAR in Flink image
+- `build-and-push.sh` - Script to build JAR, create Docker image, and push to ECR
+- `submit-job-local.sh` - Script to submit job using Flink CLI with `local://` path
+- `flink-jobmanager.yaml` - Updated to use custom image
+- `flink-taskmanager.yaml` - Updated to use custom image
+
+## Usage
+
+### 1. Build and Push Image
+
+```bash
+cd e2e-iot/high-infra/k8s/flink
+./build-and-push.sh
+```
+
+This will:
+- Build the JAR from `demos/demo/fluss_flink_realtime_demo`
+- Create Docker image with JAR at `/opt/flink/usrlib/fluss-flink-realtime-demo.jar`
+- Push to ECR: `<AWS_ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com/fluss-demo:<IMAGE_TAG>` (defaults: region `us-west-2`, tag `latest`; a timestamp tag is pushed as well)
+
+### 2. Deploy/Update Flink Cluster
+
+The Flink deployments are already configured to use the custom image:
+
+```bash
+kubectl apply -f flink-jobmanager.yaml
+kubectl apply -f flink-taskmanager.yaml
+```
+
+### 3. Submit Job
+
+```bash
+./submit-job-local.sh
+```
+
+This uses Flink CLI to submit the job with `local:///opt/flink/usrlib/fluss-flink-realtime-demo.jar`
+
+## Benefits
+
+- ✅ No manual JAR upload needed
+- ✅ JAR is versioned with Docker image
+- ✅ Cleaner than REST API approach
+- ✅ Works with standard Flink deployments (no operator needed)
+- ✅ JAR is always available in the cluster
+
+## Differences from Operator Approach
+
+- Uses standard Flink deployments (Deployment/StatefulSet) instead of FlinkDeployment CRD
+- Job submission is manual via Flink CLI, not automatic
+- More control over when jobs are submitted
+
diff --git a/e2e-iot/high-infra/k8s/flink/README.md b/e2e-iot/high-infra/k8s/flink/README.md
new file mode 100644
index 0000000..99fc569
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/README.md
@@ -0,0 +1,113 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Flink Cluster Deployment
+
+This directory contains Kubernetes manifests for deploying a Flink cluster with proper node placement.
+
+## Architecture
+
+- **1 JobManager**: Runs on a dedicated node labeled `flink-component=jobmanager`
+- **2 TaskManagers**: Each runs on a separate node labeled `flink-component=taskmanager`
+- **Node Affinity**: Ensures pods are scheduled on the correct node types
+- **Pod Anti-Affinity**: Ensures TaskManagers are distributed across different nodes
+
+## Node Groups
+
+The Flink cluster requires dedicated node groups created via Terraform:
+
+1. **flink-jobmanager**: 1 node (t3.medium)
+   - Label: `flink-component=jobmanager`
+   - Taint: `flink-component=jobmanager:NoSchedule`
+
+2. **flink-taskmanager**: 2 nodes (t3.medium each)
+   - Label: `flink-component=taskmanager`
+   - Taint: `flink-component=taskmanager:NoSchedule`
+
+## Deployment Order
+
+1. **Deploy Flink ConfigMap:**
+   ```bash
+   NAMESPACE=fluss envsubst '${NAMESPACE}' < flink-config.yaml | kubectl apply -f -
+   ```
+
+2. **Deploy JobManager:**
+   ```bash
+   kubectl apply -f flink-jobmanager.yaml
+   ```
+
+3. **Deploy TaskManagers:**
+   ```bash
+   kubectl apply -f flink-taskmanager.yaml
+   ```
+
+4. **Verify deployment:**
+   ```bash
+   kubectl get pods -n fluss -l app=flink
+   kubectl get nodes -l flink-component
+   ```
+
+5. **Submit Flink job:**
+   ```bash
+   export NAMESPACE="fluss"
+   export DEMO_IMAGE_REPO="<your-image-repo>" DEMO_IMAGE_TAG="<your-image-tag>"
+   envsubst '${NAMESPACE} ${DEMO_IMAGE_REPO} ${DEMO_IMAGE_TAG}' < flink-job-submission-simple.yaml | kubectl apply -f -
+   ```
+
+## Verifying Node Placement
+
+```bash
+# Check which nodes Flink pods are running on
+kubectl get pods -n fluss -l app=flink -o wide
+
+# Verify JobManager is on jobmanager node
+kubectl get pod -n fluss -l component=jobmanager -o jsonpath='{.items[0].spec.nodeName}'
+kubectl get node $(kubectl get pod -n fluss -l component=jobmanager -o jsonpath='{.items[0].spec.nodeName}') --show-labels
+
+# Verify TaskManagers are on different nodes
+kubectl get pods -n fluss -l component=taskmanager -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.nodeName}{"\n"}{end}'
+```
+
+## Accessing Flink Web UI
+
+```bash
+kubectl port-forward -n fluss svc/flink-jobmanager 8081:8081
+```
+
+Then open http://localhost:8081
+
+## Job Submission
+
+The Flink job is submitted via REST API using the `flink-job-submission-simple.yaml` job. This job:
+
+1. Waits for Flink JobManager to be ready
+2. Waits for Fluss coordinator to be ready
+3. Waits for producer to create the database
+4. Uploads the JAR to Flink cluster
+5. Submits the job with proper arguments
+
+Check job submission status:
+```bash
+kubectl logs -n fluss -l app=flink-job-submission --tail=50
+```
+
+Check running jobs in the Flink Web UI or via the REST API (both require the port-forward from "Accessing Flink Web UI" to be running):
+```bash
+curl http://localhost:8081/jobs
+```
+
diff --git a/e2e-iot/high-infra/k8s/flink/build-and-push.sh b/e2e-iot/high-infra/k8s/flink/build-and-push.sh
new file mode 100755
index 0000000..a0bdce3
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/build-and-push.sh
@@ -0,0 +1,230 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -e
+
+# Build and Push Fluss Flink Demo Docker Image to AWS ECR
+# This script builds the Flink demo JAR and pushes it to ECR
+
+echo "======================================"
+echo "Fluss Flink Demo - Build and Push to ECR"
+echo "======================================"
+echo ""
+
+# Color codes
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+# Configuration
+AWS_REGION=${AWS_REGION:-us-west-2}
+# Try to get AWS account ID from AWS CLI if not set
+if [ -z "${AWS_ACCOUNT_ID:-}" ]; then
+    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo "")
+    if [ -z "${AWS_ACCOUNT_ID}" ]; then
+        echo "ERROR: AWS_ACCOUNT_ID is not set and could not be determined from AWS CLI"
+        echo "Please set it with: export AWS_ACCOUNT_ID=your-account-id"
+        exit 1
+    fi
+fi
+ECR_REPOSITORY="fluss-demo"
+IMAGE_TAG=${IMAGE_TAG:-latest}
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# Go up from k8s/flink to 2-million-messages-per-second directory
+# k8s/flink -> k8s -> high-infra -> 2-million-messages-per-second
+DEMO_BASE_DIR="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
+
+echo -e "${YELLOW}Configuration:${NC}"
+echo "  AWS Region: ${AWS_REGION}"
+echo "  AWS Account: ${AWS_ACCOUNT_ID}"
+echo "  ECR Repository: ${ECR_REPOSITORY}"
+echo "  Image Tag: ${IMAGE_TAG}"
+echo "  Platform: linux/amd64"
+echo "  Demo Base Dir: ${DEMO_BASE_DIR}"
+echo ""
+
+# Check prerequisites
+echo -e "${YELLOW}Checking prerequisites...${NC}"
+
+if ! command -v docker &> /dev/null; then
+    echo -e "${RED}❌ Docker is not installed or not running${NC}"
+    exit 1
+fi
+
+if ! docker info &> /dev/null; then
+    echo -e "${RED}❌ Docker daemon is not running${NC}"
+    exit 1
+fi
+
+if ! command -v aws &> /dev/null; then
+    echo -e "${RED}❌ AWS CLI is not installed${NC}"
+    exit 1
+fi
+
+echo -e "${GREEN}✅ All prerequisites met${NC}"
+echo ""
+
+# Check if demo source exists
+DEMO_DIR="${DEMO_BASE_DIR}/fluss_flink_realtime"
+if [ ! -d "${DEMO_DIR}" ]; then
+    echo -e "${RED}❌ Demo directory not found: ${DEMO_DIR}${NC}"
+    exit 1
+fi
+
+# Find the runnable JAR (may have a version suffix); skip -sources/-javadoc artifacts and tolerate a missing target/
+JAR_FILE=$(find "${DEMO_DIR}/target" -type f -name "fluss-flink-realtime-demo*.jar" -not -name "*-sources.jar" -not -name "*-javadoc.jar" 2>/dev/null | head -1)
+
+# Check if JAR exists or needs to be built
+if [ -z "${JAR_FILE}" ] || [ ! -f "${JAR_FILE}" ]; then
+    echo -e "${YELLOW}JAR not found, building it first...${NC}"
+    command -v mvn &> /dev/null || { echo -e "${RED}❌ Maven (mvn) is not installed${NC}"; exit 1; }
+    # Build in a subshell so this script's working directory is left untouched
+    (cd "${DEMO_DIR}" && mvn clean package -DskipTests)
+    JAR_FILE=$(find "${DEMO_DIR}/target" -type f -name "fluss-flink-realtime-demo*.jar" -not -name "*-sources.jar" -not -name "*-javadoc.jar" 2>/dev/null | head -1)
+fi
+
+if [ -z "${JAR_FILE}" ] || [ ! -f "${JAR_FILE}" ]; then
+    echo -e "${RED}❌ JAR file not found after build${NC}"
+    exit 1
+fi
+
+echo -e "${GREEN}✅ JAR file ready: ${JAR_FILE}${NC}"
+echo ""
+
+# Build the Docker image
+echo -e "${YELLOW}Building Docker image for linux/amd64...${NC}"
+echo "This may take 5-10 minutes..."
+echo ""
+cd "${SCRIPT_DIR}"
+
+# Copy JAR to build context (rename to standard name)
+cp "${JAR_FILE}" ./fluss-flink-realtime-demo.jar
+
+# Build using the simple Dockerfile with buildx for cross-platform
+BUILDER_NAME="fluss-multiplatform"
+echo "Setting up Docker buildx for cross-platform build..."
+if ! docker buildx inspect "${BUILDER_NAME}" &>/dev/null; then
+    echo "Creating buildx builder..."
+    docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use --bootstrap 2>/dev/null || true
+else
+    docker buildx use "${BUILDER_NAME}" 2>/dev/null || true
+fi
+
+# Capture timestamp once to use consistently
+TIMESTAMP=$(date +%Y%m%d-%H%M%S)
+
+# Capture the build status explicitly: `set -e` would otherwise exit before cleanup, and `$?` after `rm` is rm's status
+echo "Building Flink image with embedded JAR..."
+BUILD_STATUS=0
+docker buildx build --builder "${BUILDER_NAME}" --platform linux/amd64 --load \
+  -t "${ECR_REPOSITORY}:${IMAGE_TAG}" \
+  -t "${ECR_REPOSITORY}:${TIMESTAMP}" \
+  -f Dockerfile.simple \
+  . || BUILD_STATUS=$?
+
+# Cleanup (runs whether or not the build succeeded)
+rm -f ./fluss-flink-realtime-demo.jar
+if [ "${BUILD_STATUS}" -eq 0 ]; then
+    echo -e "${GREEN}✅ Docker image built successfully${NC}"
+else
+    echo -e "${RED}❌ Docker build failed${NC}"
+    exit 1
+fi
+
+echo ""
+
+# Authenticate Docker against ECR with a short-lived login token.
+# The pipeline is tested directly in `if`: under `set -e` a separate `$?` check
+# would never run on failure, so the error message below could not be printed.
+echo -e "${YELLOW}Authenticating with ECR...${NC}"
+if aws ecr get-login-password --region "${AWS_REGION}" | \
+  docker login --username AWS --password-stdin "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com"; then
+    echo -e "${GREEN}✅ ECR authentication successful${NC}"
+else
+    echo -e "${RED}❌ ECR authentication failed${NC}"
+    exit 1
+fi
+
+echo ""
+
+# Check if ECR repository exists
+echo -e "${YELLOW}Checking if ECR repository exists...${NC}"
+if ! aws ecr describe-repositories --repository-names ${ECR_REPOSITORY} --region ${AWS_REGION} &> /dev/null; then
+    echo -e "${YELLOW}⚠️  ECR repository '${ECR_REPOSITORY}' does not exist${NC}"
+    echo -e "${YELLOW}Creating ECR repository...${NC}"
+    
+    aws ecr create-repository \
+      --repository-name ${ECR_REPOSITORY} \
+      --region ${AWS_REGION} \
+      --image-scanning-configuration scanOnPush=true
+    
+    echo -e "${GREEN}✅ ECR repository created${NC}"
+else
+    echo -e "${GREEN}✅ ECR repository exists${NC}"
+fi
+
+echo ""
+
+# Tag image for ECR
+ECR_IMAGE="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPOSITORY}"
+echo -e "${YELLOW}Tagging image for ECR...${NC}"
+docker tag "${ECR_REPOSITORY}:${IMAGE_TAG}" "${ECR_IMAGE}:${IMAGE_TAG}"
+docker tag "${ECR_REPOSITORY}:${TIMESTAMP}" "${ECR_IMAGE}:${TIMESTAMP}"
+# Also tag as latest (matching reference repository)
+if [ "${IMAGE_TAG}" != "latest" ]; then
+    docker tag "${ECR_REPOSITORY}:${IMAGE_TAG}" "${ECR_IMAGE}:latest"
+fi
+
+echo -e "${GREEN}✅ Image tagged${NC}"
+echo ""
+
+# Push to ECR; every push is checked individually so a failure of any tag is reported
+echo -e "${YELLOW}Pushing image to ECR...${NC}"
+PUSH_TAGS=("${IMAGE_TAG}" "${TIMESTAMP}")
+# Also push as latest if tagged
+if [ "${IMAGE_TAG}" != "latest" ]; then
+    PUSH_TAGS+=("latest")
+fi
+for push_tag in "${PUSH_TAGS[@]}"; do
+    if ! docker push "${ECR_IMAGE}:${push_tag}"; then
+        echo -e "${RED}❌ Image push failed${NC}"
+        exit 1
+    fi
+done
+
+echo -e "${GREEN}✅ Image pushed successfully${NC}"
+
+echo ""
+echo -e "${GREEN}======================================"
+echo "✅ Build and Push Complete!"
+echo -e "======================================${NC}"
+echo ""
+# Final summary: where the image was pushed and what to do next
+echo -e "${YELLOW}Image Details:${NC}"
+echo "  Repository: ${ECR_IMAGE}"
+echo "  Tags: ${IMAGE_TAG}, ${TIMESTAMP}"
+echo ""
+
+echo -e "${YELLOW}Next steps:${NC}"
+echo "  1. Update flink-jobmanager.yaml and flink-taskmanager.yaml to use:"
+echo "     image: ${ECR_IMAGE}:${IMAGE_TAG}"
+echo "  2. Apply the updated deployments"
+echo "  3. Submit job using Flink CLI with local:// path"
+echo ""
+
diff --git a/e2e-iot/high-infra/k8s/flink/flink-config.yaml b/e2e-iot/high-infra/k8s/flink/flink-config.yaml
new file mode 100644
index 0000000..e204d86
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/flink-config.yaml
@@ -0,0 +1,105 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: flink-config
+  namespace: ${NAMESPACE}
+data:
+  flink-conf.yaml: |
+    # Flink Configuration with Prometheus Metrics Reporter
+    # ======================================================
+    
+    # JobManager Configuration
+    jobmanager.rpc.address: flink-jobmanager
+    jobmanager.rpc.port: 6123
+    jobmanager.memory.process.size: 1600m
+    
+    # TaskManager Configuration
+    taskmanager.memory.process.size: 24g
+    taskmanager.memory.task.off-heap.size: 4g
+    taskmanager.memory.framework.off-heap.size: 2g
+    taskmanager.memory.network.fraction: 0.2
+    taskmanager.memory.network.min: 1gb
+    taskmanager.memory.network.max: 4gb
+    taskmanager.numberOfTaskSlots: 32
+    parallelism.default: 192
+    
+    # Blob and Query Server Ports
+    blob.server.port: 6124
+    queryable-state.server.ports: 6125
+    
+    # Metrics Configuration
+    # =====================
+    
+    # Enable metrics reporters
+    metrics.reporters: prometheus
+    
+    # Prometheus Reporter Configuration (Flink 1.20.3 uses factory.class)
+    metrics.reporter.prometheus.factory.class: org.apache.flink.metrics.prometheus.PrometheusReporterFactory
+    metrics.reporter.prometheus.port: 9249-9259
+    metrics.reporter.prometheus.filterLabelValueCharacters: false
+    
+    # System Resource Metrics
+    metrics.system-resource: true
+    metrics.system-resource-probing-interval: 5000
+    
+    # Latency Tracking
+    metrics.latency.interval: 10000
+    metrics.latency.granularity: operator
+    metrics.latency.history-size: 128
+    
+    # Web Frontend Configuration
+    web.submit.enable: true
+    web.cancel.enable: true
+    web.tmpdir: /tmp/flink-web
+    
+    # Watermark configuration to prevent watermark stalling
+    # This helps with low watermark issues when some partitions are idle
+    pipeline.auto-watermark-interval: 200ms
+    
+    # Network Configuration for High Throughput (2M+ messages/sec)
+    # ============================================================
+    # Network buffer configuration for high-throughput scenarios
+    # Memory allocation is handled by taskmanager.memory.network.* above
+    taskmanager.network.memory.buffers-per-channel: 2
+    taskmanager.network.memory.floating-buffers-per-gate: 8
+    
+    # Network buffer timeout - increase for high throughput
+    taskmanager.network.request-backoff.max: 10000
+    taskmanager.network.request-backoff.initial: 100
+    
+    # Network stack optimizations
+    taskmanager.network.detailed-metrics: false
+    
+    # Checkpointing Configuration - DISABLED for performance
+    # Checkpointing disabled to reduce overhead and improve throughput
+    # NOTE: This disables fault tolerance - job will restart from beginning on failure
+    state.backend: rocksdb
+    state.backend.incremental: true
+    # Checkpointing stays disabled as long as execution.checkpointing.interval is not set.
+    # To enable it, set a duration (and optionally a mode), for example:
+    # execution.checkpointing.interval: 60s
+    
+    # S3 Configuration - Force IRSA credential provider (matches reference config)
+    fs.s3a.aws.credentials.provider: com.amazonaws.auth.WebIdentityTokenCredentialsProvider
+    s3.path.style.access: "false"
+    
+    # Logging
+    taskmanager.log.path: /opt/flink/log
+    jobmanager.log.path: /opt/flink/log
diff --git a/e2e-iot/high-infra/k8s/flink/flink-job-submission-simple.yaml b/e2e-iot/high-infra/k8s/flink/flink-job-submission-simple.yaml
new file mode 100644
index 0000000..2015622
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/flink-job-submission-simple.yaml
@@ -0,0 +1,233 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Simplified Flink Job Submission
+# An init container copies the job JAR out of the demo image into a shared volume; a curl container then submits it via the Flink REST API
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: flink-job-submitter
+  namespace: ${NAMESPACE}
+  labels:
+    app: flink-job-submission
+spec:
+  backoffLimit: 3
+  completions: 1
+  parallelism: 1
+  ttlSecondsAfterFinished: 3600
+  template:
+    metadata:
+      labels:
+        app: flink-job-submission
+    spec:
+      restartPolicy: OnFailure
+      initContainers:
+        # Wait for Flink JobManager to be ready
+        - name: wait-for-flink
+          image: busybox:1.36
+          command:
+            - sh
+            - -c
+            - |
+              echo "Waiting for Flink JobManager to be ready..."
+              MAX_ATTEMPTS=60
+              ATTEMPT=0
+              while [ $ATTEMPT -lt $MAX_ATTEMPTS ]; do
+                if nc -zv -w 2 flink-jobmanager.${NAMESPACE}.svc.cluster.local 8081 2>&1 | grep -q "open\|succeeded"; then
+                  echo "Flink JobManager is ready!"
+                  exit 0
+                fi
+                ATTEMPT=$((ATTEMPT + 1))
+                echo "Waiting for Flink JobManager... (attempt $ATTEMPT/$MAX_ATTEMPTS)"
+                sleep 2
+              done
+              echo "ERROR: Flink JobManager did not become ready after $MAX_ATTEMPTS attempts"
+              exit 1
+        # Wait for Fluss coordinator
+        - name: wait-for-fluss
+          image: busybox:1.36
+          command:
+            - sh
+            - -c
+            - |
+              echo "Waiting for Fluss coordinator to be ready..."
+              COORD_HOST="coordinator-server-hs.${NAMESPACE}.svc.cluster.local"
+              # Probe the port with nc; -w 2 bounds each connect attempt, and both nc success wordings are accepted (same as wait-for-flink)
+              until nc -zv -w 2 "$COORD_HOST" 9124 2>&1 | grep -q "open\|succeeded"; do
+                echo "Waiting for Fluss coordinator on port 9124..."
+                sleep 2
+              done
+              echo "Fluss coordinator is ready!"
+        # Wait for producer to create database
+        - name: wait-for-producer-database
+          image: busybox:1.36
+          command:
+            - sh
+            - -c
+            - |
+              echo "Waiting for producer to start and create database 'iot'..."
+              echo "This init container waits 30 seconds to give the producer time to create the database"
+              sleep 30
+              echo "Proceeding - producer should have created the database by now"
+              exit 0
+        # Copy JAR from demo image to shared volume
+        - name: copy-jar
+          image: ${DEMO_IMAGE_REPO}:${DEMO_IMAGE_TAG}
+          command:
+            - sh
+            - -c
+            - |
+              echo "Copying JAR file to shared volume..."
+              if [ -f /opt/flink/usrlib/fluss-flink-realtime-demo.jar ]; then
+                cp /opt/flink/usrlib/fluss-flink-realtime-demo.jar /shared/fluss-flink-realtime-demo.jar
+                ls -lh /shared/
+                echo "JAR copied successfully"
+              else
+                echo "ERROR: JAR file not found in /opt/flink/usrlib/fluss-flink-realtime-demo.jar"
+                exit 1
+              fi
+          volumeMounts:
+            - name: shared-jar
+              mountPath: /shared
+      volumes:
+        - name: shared-jar
+          emptyDir: {}
+      containers:
+        - name: job-submitter
+          image: curlimages/curl:latest
+          securityContext:
+            runAsUser: 0
+          command:
+            - sh
+            - -c
+            - |
+              set -e
+              
+              echo "Submitting Flink job to cluster..."
+              
+              # Install jq if missing (this runs under sh, so use POSIX redirection rather than the bash-only '&>')
+              if ! command -v jq >/dev/null 2>&1; then
+                echo "Installing jq..."
+                apk add --no-cache jq 2>/dev/null || \
+                echo "WARNING: Could not install jq, will use grep/sed instead"
+              fi
+              
+              JOBMANAGER="flink-jobmanager.${NAMESPACE}.svc.cluster.local:8081"
+              JAR_PATH="/shared/fluss-flink-realtime-demo.jar"
+              
+              # Check that the JAR exists and is not empty (a truncated copy would only fail later, at upload time)
+              if [ ! -f "$JAR_PATH" ]; then
+                echo "ERROR: JAR file not found at $JAR_PATH"
+                exit 1
+              fi
+              if [ ! -s "$JAR_PATH" ]; then
+                echo "ERROR: JAR file at $JAR_PATH is empty"
+                exit 1
+              fi
+              
+              # Upload JAR to Flink cluster
+              echo "Uploading JAR to Flink cluster..."
+              UPLOAD_RESPONSE=$(curl -s -X POST \
+                "http://${JOBMANAGER}/v1/jars/upload" \
+                -H "Content-Type: multipart/form-data" \
+                -F "jarfile=@${JAR_PATH}")
+              
+              echo "Upload response: $UPLOAD_RESPONSE"
+              
+              # Extract JAR ID from response (use jq if available, otherwise grep/sed)
+              if command -v jq >/dev/null 2>&1; then
+                JAR_ID=$(echo "$UPLOAD_RESPONSE" | jq -r '.filename' | sed 's|.*/||')
+              else
+                # Fallback: extract filename from JSON using grep/sed
+                JAR_ID=$(echo "$UPLOAD_RESPONSE" | grep -o '"filename":"[^"]*' | sed 's/"filename":"//' | sed 's|.*/||')
+              fi
+              
+              if [ -z "$JAR_ID" ] || [ "$JAR_ID" = "null" ]; then
+                echo "ERROR: Failed to upload JAR"
+                echo "Response: $UPLOAD_RESPONSE"
+                exit 1
+              fi
+              
+              echo "JAR uploaded with ID: $JAR_ID"
+              
+              # Submit job
+              echo "Submitting job..."
+              JOB_RESPONSE=$(curl -s -X POST \
+                "http://${JOBMANAGER}/v1/jars/${JAR_ID}/run" \
+                -H "Content-Type: application/json" \
+                -d '{
+                  "entryClass": "org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob",
+                  "programArgs": "--bootstrap coordinator-server-hs.${NAMESPACE}.svc.cluster.local:9124 --database iot --table sensor_readings --window-minutes 1",
+                  "parallelism": 2,
+                  "savepointPath": null,
+                  "allowNonRestoredState": false
+                }')
+              
+              echo "Job submission response: $JOB_RESPONSE"
+              
+              # Extract Job ID from response
+              if command -v jq >/dev/null 2>&1; then
+                JOB_ID=$(echo "$JOB_RESPONSE" | jq -r '.jobid')
+              else
+                # Fallback: extract jobid from JSON using grep/sed
+                JOB_ID=$(echo "$JOB_RESPONSE" | grep -o '"jobid":"[^"]*' | sed 's/"jobid":"//')
+              fi
+              
+              if [ -z "$JOB_ID" ] || [ "$JOB_ID" = "null" ]; then
+                echo "ERROR: Failed to submit job"
+                echo "Response: $JOB_RESPONSE"
+                exit 1
+              fi
+              
+              echo "Job submitted successfully!"
+              echo "Job ID: $JOB_ID"
+              echo "Job status URL: http://${JOBMANAGER}/#/job/${JOB_ID}"
+              
+              # Wait a bit to ensure job started
+              sleep 10
+              
+              # Check job status
+              if command -v jq >/dev/null 2>&1; then
+                JOB_STATUS=$(curl -s "http://${JOBMANAGER}/jobs/${JOB_ID}" | jq -r '.state // "UNKNOWN"')
+              else
+                # Fallback: extract state from JSON (sed exits 0 even with no match, so the default is applied below)
+                JOB_STATUS=$(curl -s "http://${JOBMANAGER}/jobs/${JOB_ID}" | grep -o '"state":"[^"]*' | sed 's/"state":"//')
+              fi
+              echo "Job status: ${JOB_STATUS:=UNKNOWN}"
+              
+              if [ "$JOB_STATUS" = "FAILED" ] || [ "$JOB_STATUS" = "CANCELED" ]; then
+                echo "ERROR: Job failed or was canceled"
+                exit 1
+              fi
+              
+              echo "Job submission completed successfully!"
+              echo "Job is running on Flink cluster. Check Flink Web UI for details."
+              
+              # Keep container running to show job is submitted (optional)
+              # In production, you might want the job to exit here
+              sleep 3600
+          volumeMounts:
+            - name: shared-jar
+              mountPath: /shared
+          resources:
+            requests:
+              memory: "512Mi"
+              cpu: "200m"
+            limits:
+              memory: "1Gi"
+              cpu: "500m"
+
diff --git a/e2e-iot/high-infra/k8s/flink/flink-job-submission.yaml b/e2e-iot/high-infra/k8s/flink/flink-job-submission.yaml
new file mode 100644
index 0000000..4caa955
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/flink-job-submission.yaml
@@ -0,0 +1,216 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: flink-job-jar
+  namespace: fluss
+  labels:
+    app: flink-job-submission
+binaryData:
+  # This will be populated by copying the JAR file
+  # To populate: kubectl create configmap flink-job-jar --from-file=fluss-flink-realtime-demo.jar=/path/to/jar --dry-run=client -o yaml | kubectl apply -f -
+  # Or use init container to download from S3/ECR/URL
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: flink-job-submitter
+  namespace: fluss
+  labels:
+    app: flink-job-submission
+spec:
+  backoffLimit: 3
+  completions: 1
+  parallelism: 1
+  ttlSecondsAfterFinished: 3600
+  template:
+    metadata:
+      labels:
+        app: flink-job-submission
+    spec:
+      restartPolicy: OnFailure
+      initContainers:
+        # Wait for Flink JobManager to be ready
+        - name: wait-for-flink
+          image: busybox:1.36
+          command:
+            - sh
+            - -c
+            - |
+              echo "Waiting for Flink JobManager to be ready..."
+              MAX_ATTEMPTS=60
+              ATTEMPT=0
+              while [ $ATTEMPT -lt $MAX_ATTEMPTS ]; do
+                if nc -zv -w 2 flink-jobmanager.fluss.svc.cluster.local 8081 2>&1 | grep -q "open\|succeeded"; then
+                  echo "Flink JobManager is ready!"
+                  exit 0
+                fi
+                ATTEMPT=$((ATTEMPT + 1))
+                echo "Waiting for Flink JobManager... (attempt $ATTEMPT/$MAX_ATTEMPTS)"
+                sleep 2
+              done
+              echo "ERROR: Flink JobManager did not become ready after $MAX_ATTEMPTS attempts"
+              exit 1
+        # Wait for Fluss coordinator
+        - name: wait-for-fluss
+          image: busybox:1.36
+          command:
+            - sh
+            - -c
+            - |
+              echo "Waiting for Fluss coordinator to be ready..."
+              COORD_HOST="coordinator-server-hs.fluss.svc.cluster.local"
+              MAX_ATTEMPTS=60
+              ATTEMPT=0
+              while [ $ATTEMPT -lt $MAX_ATTEMPTS ]; do
+                if nc -zv -w 2 "$COORD_HOST" 9124 2>&1 | grep -q "open\|succeeded"; then
+                  echo "Fluss coordinator is ready!"
+                  exit 0
+                fi
+                ATTEMPT=$((ATTEMPT + 1))
+                echo "Waiting for Fluss coordinator... (attempt $ATTEMPT/$MAX_ATTEMPTS)"
+                sleep 2
+              done
+              echo "ERROR: Fluss coordinator did not become ready after $MAX_ATTEMPTS attempts"
+              exit 1
+        # Wait for producer to create database
+        - name: wait-for-producer-database
+          image: busybox:1.36
+          command:
+            - sh
+            - -c
+            - |
+              echo "Waiting for producer to start and create database 'iot'..."
+              echo "This init container waits 30 seconds to give the producer time to create the database"
+              sleep 30
+              echo "Proceeding - producer should have created the database by now"
+              exit 0
+        # Download or copy JAR file
+        - name: prepare-job-jar
+          image: curlimages/curl:latest
+          command:
+            - sh
+            - -c
+            - |
+              # Option 1: Download from URL (if JAR is hosted)
+              # curl -L -o /tmp/fluss-flink-realtime-demo.jar "${JAR_URL}"
+              
+              # Option 2: Copy from ConfigMap (if populated)
+              # cp /job-jar/fluss-flink-realtime-demo.jar /tmp/fluss-flink-realtime-demo.jar
+              
+              # Option 3: Download from S3 (if configured)
+              # aws s3 cp s3://bucket/path/to/jar /tmp/fluss-flink-realtime-demo.jar
+              
+              # NOTE: all three options above are commented out, so this container stages no JAR.
+              # Enable one of them: the job-submitter uploads /tmp/fluss-flink-realtime-demo.jar from this shared volume.
+              echo "JAR preparation complete"
+              ls -lh /tmp/ || true
+          volumeMounts:
+            - name: job-jar
+              mountPath: /tmp
+          env:
+            - name: JAR_URL
+              value: "${JAR_URL:-}"
+      containers:
+        - name: job-submitter
+          image: apache/flink:1.20.3-scala_2.12-java17
+          command:
+            - sh
+            - -c
+            - |
+              set -e
+              
+              echo "Submitting Flink job to cluster..."
+              
+              # Method 1: Use Flink CLI (requires JAR in image or mounted)
+              # /opt/flink/bin/flink run \
+              #   -m flink-jobmanager:6123 \
+              #   -c org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob \
+              #   /tmp/fluss-flink-realtime-demo.jar \
+              #   --bootstrap coordinator-server-hs.fluss.svc.cluster.local:9124 \
+              #   --database iot \
+              #   --table sensor_readings \
+              #   --window-minutes 1
+              
+              # Method 2: Use Flink REST API (more flexible)
+              JOBMANAGER="flink-jobmanager.fluss.svc.cluster.local:8081"
+              
+              # Upload JAR first (if not already in cluster)
+              echo "Uploading JAR to Flink cluster..."
+              JAR_ID=$(curl -s -X POST \
+                "http://${JOBMANAGER}/v1/jars/upload" \
+                -H "Content-Type: multipart/form-data" \
+                -F "jarfile=@/tmp/fluss-flink-realtime-demo.jar" \
+                | jq -r '.filename' | sed 's|.*/||')
+              
+              if [ -z "$JAR_ID" ] || [ "$JAR_ID" = "null" ]; then
+                echo "ERROR: Failed to upload JAR"
+                exit 1
+              fi
+              
+              echo "JAR uploaded with ID: $JAR_ID"
+              
+              # Submit job
+              echo "Submitting job..."
+              JOB_RESPONSE=$(curl -s -X POST \
+                "http://${JOBMANAGER}/v1/jars/${JAR_ID}/run" \
+                -H "Content-Type: application/json" \
+                -d '{
+                  "entryClass": "org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob",
+                  "programArgs": "--bootstrap coordinator-server-hs.fluss.svc.cluster.local:9124 --database iot --table sensor_readings --window-minutes 1",
+                  "parallelism": 2,
+                  "savepointPath": null,
+                  "allowNonRestoredState": false
+                }')
+              
+              JOB_ID=$(echo "$JOB_RESPONSE" | jq -r '.jobid')
+              
+              if [ -z "$JOB_ID" ] || [ "$JOB_ID" = "null" ]; then
+                echo "ERROR: Failed to submit job"
+                echo "Response: $JOB_RESPONSE"
+                exit 1
+              fi
+              
+              echo "Job submitted successfully!"
+              echo "Job ID: $JOB_ID"
+              echo "Job status: http://${JOBMANAGER}/#/job/${JOB_ID}"
+              
+              # Wait a bit to ensure job started
+              sleep 5
+              
+              # Check job status; fail only on terminal failure states so a slow start does not make the pod retry and resubmit the job
+              JOB_STATUS=$(curl -s "http://${JOBMANAGER}/jobs/${JOB_ID}" | jq -r '.state')
+              echo "Job status: $JOB_STATUS"
+              
+              if [ "$JOB_STATUS" = "FAILED" ] || [ "$JOB_STATUS" = "CANCELED" ]; then
+                echo "ERROR: Job failed or was canceled: $JOB_STATUS"
+                exit 1
+              fi
+              
+              echo "Job submission completed successfully!"
+          volumeMounts:
+            - name: job-jar
+              mountPath: /tmp
+          env:
+            - name: FLINK_CONF_DIR
+              value: "/opt/flink/conf"
+      volumes:
+        - name: job-jar
+          emptyDir: {}
+
diff --git a/e2e-iot/high-infra/k8s/flink/flink-jobmanager.yaml b/e2e-iot/high-infra/k8s/flink/flink-jobmanager.yaml
new file mode 100644
index 0000000..a1276f8
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/flink-jobmanager.yaml
@@ -0,0 +1,192 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: flink-jobmanager
+  namespace: ${NAMESPACE}
+  labels:
+    app: flink
+    component: jobmanager
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: flink
+      component: jobmanager
+  template:
+    metadata:
+      labels:
+        app: flink
+        component: jobmanager
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "9249"
+        prometheus.io/path: "/metrics"
+    spec:
+      serviceAccountName: flink
+      nodeSelector:
+        flink-component: jobmanager
+      tolerations:
+        - key: "flink-component"
+          operator: "Equal"
+          value: "jobmanager"
+          effect: "NoSchedule"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: flink-component
+                    operator: In
+                    values:
+                      - jobmanager
+      initContainers:
+        - name: install-plugins
+          image: apache/flink:1.20.3-scala_2.12-java17
+          imagePullPolicy: IfNotPresent
+          command: ["sh", "-c"]
+          args:
+            - |
+              set -e
+              # Install S3 filesystem plugin
+              mkdir -p /opt/flink/plugins/s3-fs-hadoop
+              if [ -f /opt/flink/opt/flink-s3-fs-hadoop-1.20.3.jar ]; then
+                cp /opt/flink/opt/flink-s3-fs-hadoop-1.20.3.jar /opt/flink/plugins/s3-fs-hadoop/
+                echo "S3 plugin installed"
+              else
+                echo "WARNING: S3 plugin not found in /opt/flink/opt/, skipping"
+              fi
+              
+              # Install Prometheus metrics plugin
+              mkdir -p /opt/flink/plugins/metrics-prometheus
+              if [ -f /opt/flink/opt/flink-metrics-prometheus-1.20.3.jar ]; then
+                cp /opt/flink/opt/flink-metrics-prometheus-1.20.3.jar /opt/flink/plugins/metrics-prometheus/
+                echo "Prometheus plugin copied from /opt/flink/opt/"
+              elif [ -f /opt/flink/lib/flink-metrics-prometheus-1.20.3.jar ]; then
+                cp /opt/flink/lib/flink-metrics-prometheus-1.20.3.jar /opt/flink/plugins/metrics-prometheus/
+                echo "Prometheus plugin copied from /opt/flink/lib/"
+              else
+                echo "Downloading Prometheus metrics plugin..."  # curl -f below: an HTTP error must fail instead of saving the error page as the JAR
+                curl -fL https://repo1.maven.org/maven2/org/apache/flink/flink-metrics-prometheus/1.20.3/flink-metrics-prometheus-1.20.3.jar \
+                  -o /opt/flink/plugins/metrics-prometheus/flink-metrics-prometheus-1.20.3.jar || {
+                  echo "ERROR: Failed to download Prometheus plugin"
+                  exit 1
+                }
+                echo "Prometheus plugin downloaded"
+              fi
+              
+              ls -la /opt/flink/plugins/s3-fs-hadoop/ || true
+              ls -la /opt/flink/plugins/metrics-prometheus/
+              echo "All plugins installed successfully"
+          volumeMounts:
+            - name: flink-plugins
+              mountPath: /opt/flink/plugins
+        - name: copy-job-jar
+          image: ${DEMO_IMAGE_REPO}:${DEMO_IMAGE_TAG}
+          imagePullPolicy: Always
+          command: ["sh", "-c"]
+          args:
+            - |
+              set -e
+              echo "Copying Flink job JAR to shared volume..."
+              if [ -f /opt/flink/usrlib/fluss-flink-realtime-demo.jar ]; then
+                cp /opt/flink/usrlib/fluss-flink-realtime-demo.jar /shared/fluss-flink-realtime-demo.jar
+                ls -lh /shared/fluss-flink-realtime-demo.jar
+                echo "✓ JAR copied successfully"
+              else
+                echo "ERROR: JAR file not found in /opt/flink/usrlib/fluss-flink-realtime-demo.jar"
+                exit 1
+              fi
+          volumeMounts:
+            - name: flink-job-jar
+              mountPath: /shared
+      containers:
+        - name: jobmanager
+          image: apache/flink:1.20.3-scala_2.12-java17
+          imagePullPolicy: IfNotPresent
+          command: ["/opt/flink/bin/jobmanager.sh"]
+          args: ["start-foreground"]
+          env:
+            - name: FLINK_PROPERTIES
+              value: "jobmanager.rpc.address: flink-jobmanager"
+            - name: JAVA_TOOL_OPTIONS
+              value: "--add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.time=ALL-UNNAMED"
+          ports:
+            - name: rpc
+              containerPort: 6123
+            - name: blob-server
+              containerPort: 6124
+            - name: queryable-state
+              containerPort: 6125
+            - name: webui
+              containerPort: 8081
+            - name: metrics
+              containerPort: 9249
+          volumeMounts:
+            - name: flink-config
+              mountPath: /opt/flink/conf
+            - name: flink-plugins
+              mountPath: /opt/flink/plugins
+            - name: flink-job-jar
+              mountPath: /opt/flink/usrlib
+          resources:
+            requests:
+              memory: "1.5Gi"
+              cpu: "500m"
+            limits:
+              memory: "2Gi"
+              cpu: "1000m"
+      volumes:
+        - name: flink-config
+          configMap:
+            name: flink-config
+        - name: flink-plugins
+          emptyDir: {}
+        - name: flink-job-jar
+          emptyDir: {}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: flink-jobmanager
+  namespace: ${NAMESPACE}
+  labels:
+    app: flink
+    component: jobmanager
+spec:
+  type: ClusterIP
+  ports:
+    - name: rpc
+      port: 6123
+      targetPort: 6123
+    - name: blob-server
+      port: 6124
+      targetPort: 6124
+    - name: queryable-state
+      port: 6125
+      targetPort: 6125
+    - name: webui
+      port: 8081
+      targetPort: 8081
+    - name: metrics
+      port: 9249
+      targetPort: 9249
+  selector:
+    app: flink
+    component: jobmanager
diff --git a/e2e-iot/high-infra/k8s/flink/flink-serviceaccount.yaml b/e2e-iot/high-infra/k8s/flink/flink-serviceaccount.yaml
new file mode 100644
index 0000000..0d2e744
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/flink-serviceaccount.yaml
@@ -0,0 +1,25 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: flink
+  namespace: ${NAMESPACE}
+  labels:
+    app: flink
+
diff --git a/e2e-iot/high-infra/k8s/flink/flink-taskmanager.yaml b/e2e-iot/high-infra/k8s/flink/flink-taskmanager.yaml
new file mode 100644
index 0000000..e06723c
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/flink-taskmanager.yaml
@@ -0,0 +1,179 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: flink-taskmanager
+  namespace: ${NAMESPACE}
+  labels:
+    app: flink
+    component: taskmanager
+spec:
+  serviceName: flink-taskmanager
+  replicas: 6
+  selector:
+    matchLabels:
+      app: flink
+      component: taskmanager
+  template:
+    metadata:
+      labels:
+        app: flink
+        component: taskmanager
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "9249"
+        prometheus.io/path: "/metrics"
+    spec:
+      serviceAccountName: flink
+      nodeSelector:
+        flink-component: taskmanager
+      tolerations:
+        - key: "flink-component"
+          operator: "Equal"
+          value: "taskmanager"
+          effect: "NoSchedule"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: flink-component
+                    operator: In
+                    values:
+                      - taskmanager
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            - labelSelector:
+                matchExpressions:
+                  - key: app
+                    operator: In
+                    values:
+                      - flink
+                  - key: component
+                    operator: In
+                    values:
+                      - taskmanager
+              topologyKey: kubernetes.io/hostname
+      initContainers:
+        - name: install-plugins
+          image: apache/flink:1.20.3-scala_2.12-java17
+          imagePullPolicy: IfNotPresent
+          command: ["sh", "-c"]
+          args:
+            - |
+              set -e
+              # Install S3 filesystem plugin
+              mkdir -p /opt/flink/plugins/s3-fs-hadoop
+              if [ -f /opt/flink/opt/flink-s3-fs-hadoop-1.20.3.jar ]; then
+                cp /opt/flink/opt/flink-s3-fs-hadoop-1.20.3.jar /opt/flink/plugins/s3-fs-hadoop/
+                echo "S3 plugin installed"
+              else
+                echo "WARNING: S3 plugin not found in /opt/flink/opt/, skipping"
+              fi
+              
+              # Install Prometheus metrics plugin
+              mkdir -p /opt/flink/plugins/metrics-prometheus
+              if [ -f /opt/flink/opt/flink-metrics-prometheus-1.20.3.jar ]; then
+                cp /opt/flink/opt/flink-metrics-prometheus-1.20.3.jar /opt/flink/plugins/metrics-prometheus/
+                echo "Prometheus plugin copied from /opt/flink/opt/"
+              elif [ -f /opt/flink/lib/flink-metrics-prometheus-1.20.3.jar ]; then
+                cp /opt/flink/lib/flink-metrics-prometheus-1.20.3.jar /opt/flink/plugins/metrics-prometheus/
+                echo "Prometheus plugin copied from /opt/flink/lib/"
+              else
+                echo "Downloading Prometheus metrics plugin..."  # curl -f below: an HTTP error must fail instead of saving the error page as the JAR
+                curl -fL https://repo1.maven.org/maven2/org/apache/flink/flink-metrics-prometheus/1.20.3/flink-metrics-prometheus-1.20.3.jar \
+                  -o /opt/flink/plugins/metrics-prometheus/flink-metrics-prometheus-1.20.3.jar || {
+                  echo "ERROR: Failed to download Prometheus plugin"
+                  exit 1
+                }
+                echo "Prometheus plugin downloaded"
+              fi
+              
+              ls -la /opt/flink/plugins/s3-fs-hadoop/ || true
+              ls -la /opt/flink/plugins/metrics-prometheus/
+              echo "All plugins installed successfully"
+          volumeMounts:
+            - name: flink-plugins
+              mountPath: /opt/flink/plugins
+      containers:
+        - name: taskmanager
+          image: apache/flink:1.20.3-scala_2.12-java17
+          imagePullPolicy: IfNotPresent
+          command: ["/opt/flink/bin/taskmanager.sh"]
+          args: ["start-foreground"]
+          env:
+            - name: FLINK_PROPERTIES
+              value: "jobmanager.rpc.address: flink-jobmanager"
+            - name: JAVA_TOOL_OPTIONS
+              value: "--add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.time=ALL-UNNAMED"
+          ports:
+            - name: data
+              containerPort: 6122
+            - name: rpc
+              containerPort: 6123
+            - name: queryable-state
+              containerPort: 6125
+            - name: metrics
+              containerPort: 9249
+          volumeMounts:
+            - name: flink-config
+              mountPath: /opt/flink/conf
+            - name: flink-plugins
+              mountPath: /opt/flink/plugins
+          resources:
+            requests:
+              memory: "24Gi"
+              cpu: "15000m"
+            limits:
+              memory: "28Gi"
+              cpu: "15000m"
+      volumes:
+        - name: flink-config
+          configMap:
+            name: flink-config
+        - name: flink-plugins
+          emptyDir: {}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: flink-taskmanager
+  namespace: ${NAMESPACE}
+  labels:
+    app: flink
+    component: taskmanager
+spec:
+  type: ClusterIP
+  clusterIP: None  # Headless service
+  ports:
+    - name: data
+      port: 6122
+      targetPort: 6122
+    - name: rpc
+      port: 6123
+      targetPort: 6123
+    - name: queryable-state
+      port: 6125
+      targetPort: 6125
+    - name: metrics
+      port: 9249
+      targetPort: 9249
+  selector:
+    app: flink
+    component: taskmanager
diff --git a/e2e-iot/high-infra/k8s/flink/submit-flink-job-kubectl.sh b/e2e-iot/high-infra/k8s/flink/submit-flink-job-kubectl.sh
new file mode 100755
index 0000000..c6c82d3
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/submit-flink-job-kubectl.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -e
+
+# Script to submit Flink job using kubectl exec into JobManager
+# This script runs the Flink CLI from within the JobManager pod
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+NAMESPACE="${NAMESPACE:-fluss}"
+
+echo "=== Submitting Flink Aggregator Job via Flink CLI ==="
+echo ""
+
+# Get JobManager pod name
+JOBMANAGER_POD=$(kubectl get pod -n ${NAMESPACE} -l app=flink-jobmanager -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+
+if [ -z "${JOBMANAGER_POD}" ]; then
+    echo "ERROR: Flink JobManager pod not found in namespace ${NAMESPACE}"
+    exit 1
+fi
+
+echo "Using JobManager pod: ${JOBMANAGER_POD}"
+echo ""
+
+# No check or copy of the JAR is performed here.
+# For now, we'll assume the JAR is available in the demo image
+# You can copy it first using: kubectl cp <local-jar> ${NAMESPACE}/${JOBMANAGER_POD}:/tmp/fluss-flink-realtime-demo.jar
+
+echo "Submitting Flink job..."
+echo "Note: The JAR file needs to be available in the JobManager pod"
+echo ""
+
+# Option 1 (disabled, kept as a reference command): If JAR is already in the pod
+# kubectl exec -n ${NAMESPACE} ${JOBMANAGER_POD} -- /opt/flink/bin/flink run \
+#     -c org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob \
+#     /tmp/fluss-flink-realtime-demo.jar \
+#     --bootstrap coordinator-server-hs.${NAMESPACE}.svc.cluster.local:9124 \
+#     --database iot \
+#     --table sensor_readings \
+#     --window-minutes 1
+
+# Option 2: Use REST API via curl from a temporary helper pod (the kubectl run further down)
+echo "Using REST API to submit job..."
+echo ""
+
+# First, we need to get the JAR into the pod or use a URL
+# For now, let's use the REST API approach with a helper pod
+
+echo "Creating temporary pod to submit job..."
+kubectl run flink-job-submitter-$(date +%s) \
+    --rm -i --restart=Never \
+    --image=curlimages/curl:latest \
+    -n ${NAMESPACE} \
+    -- sh -c "
+        apk add --no-cache jq >/dev/null 2>&1 || true
+        echo 'Uploading JAR...'
+        # Note: You need to provide the JAR file
+        # This is a template - you'll need to modify based on how you want to provide the JAR
+        echo 'ERROR: JAR file path required'
+        echo 'Please modify this script to provide the JAR file location'
+        echo 'Options:'
+        echo '  1. Copy JAR to a pod: kubectl cp <local-jar> ${NAMESPACE}/<pod>:/tmp/jar'
+        echo '  2. Host JAR on HTTP server and download it'
+        echo '  3. Use ConfigMap or PersistentVolume'
+    " || true
+
+# Manual fallback instructions. Every printf argument is emitted on its own line;
+# an empty argument produces a blank line.
+printf '%s\n' \
+    "" \
+    "For manual submission, you can:" "" \
+    "1. Port-forward Flink JobManager:" \
+    "   kubectl port-forward -n ${NAMESPACE} svc/flink-jobmanager 8081:8081" \
+    "" "2. Upload JAR via REST API:" \
+    "   curl -X POST -H 'Expect:' -F 'jarfile=@/path/to/fluss-flink-realtime-demo.jar' http://localhost:8081/v1/jars/upload" \
+    "" "3. Submit job:" \
+    "   curl -X POST http://localhost:8081/v1/jars/<jar-id>/run \\" \
+    "     -H 'Content-Type: application/json' \\" \
+    "     -d '{\"entryClass\":\"org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob\",\"programArgs\":\"--bootstrap coordinator-server-hs.${NAMESPACE}.svc.cluster.local:9124 --database iot --table sensor_readings --window-minutes 1\",\"parallelism\":2}'" \
+    ""
+
diff --git a/e2e-iot/high-infra/k8s/flink/submit-flink-job.sh b/e2e-iot/high-infra/k8s/flink/submit-flink-job.sh
new file mode 100755
index 0000000..7fd86a6
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/submit-flink-job.sh
@@ -0,0 +1,133 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -e
+
+# Script to manually submit Flink aggregator job to Flink cluster
+# Usage: ./submit-flink-job.sh [jar-path] [flink-jobmanager-url]
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+NAMESPACE="${NAMESPACE:-fluss}"
+
+# Default values
+JAR_PATH="${1:-/app/fluss-flink-realtime-demo.jar}"
+JOBMANAGER="${2:-flink-jobmanager.${NAMESPACE}.svc.cluster.local:8081}"
+
+# If JAR_PATH is a local file, we need to upload it first
+# Otherwise, assume it's already in the cluster or accessible
+
+echo "=== Submitting Flink Aggregator Job ==="
+echo "JAR Path: ${JAR_PATH}"
+echo "Flink JobManager: ${JOBMANAGER}"
+echo ""
+
+# Check if we're running inside Kubernetes or locally (in-cluster = non-empty KUBERNETES_SERVICE_HOST)
+if [ -n "${KUBERNETES_SERVICE_HOST:-}" ]; then
+    echo "Running inside Kubernetes cluster..."
+
+    # The JAR must be a regular file visible to this process
+    if [ -f "${JAR_PATH}" ]; then
+        echo "Found JAR at ${JAR_PATH}"
+        JAR_TO_UPLOAD="${JAR_PATH}"
+    else
+        echo "ERROR: JAR file not found at ${JAR_PATH}"
+        echo "Please provide the path to the JAR file"
+        exit 1
+    fi
+
+    # Upload JAR to Flink cluster
+    echo "Uploading JAR to Flink cluster..."
+    UPLOAD_RESPONSE=$(curl -s -X POST \
+        "http://${JOBMANAGER}/v1/jars/upload" \
+        -H "Content-Type: multipart/form-data" \
+        -F "jarfile=@${JAR_TO_UPLOAD}")
+
+    echo "Upload response: ${UPLOAD_RESPONSE}"
+
+    # Extract JAR ID: basename of the response's "filename" field (jq if installed, grep/sed otherwise)
+    if command -v jq &> /dev/null; then
+        JAR_ID=$(echo "${UPLOAD_RESPONSE}" | jq -r '.filename' | sed 's|.*/||')
+    else
+        JAR_ID=$(echo "${UPLOAD_RESPONSE}" | grep -o '"filename":"[^"]*' | sed 's/"filename":"//' | sed 's|.*/||')
+    fi
+
+    if [ -z "${JAR_ID}" ] || [ "${JAR_ID}" = "null" ]; then
+        echo "ERROR: Failed to upload JAR"
+        echo "Response: ${UPLOAD_RESPONSE}"
+        exit 1
+    fi
+
+    echo "JAR uploaded with ID: ${JAR_ID}"
+
+    # Submit job. The JSON body is single-quoted; the quotes are closed around ${NAMESPACE} so it expands.
+    echo "Submitting job..."
+    JOB_RESPONSE=$(curl -s -X POST \
+        "http://${JOBMANAGER}/v1/jars/${JAR_ID}/run" \
+        -H "Content-Type: application/json" \
+        -d '{
+            "entryClass": "org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob",
+            "programArgs": "--bootstrap coordinator-server-hs.'"${NAMESPACE}"'.svc.cluster.local:9124 --database iot --table sensor_readings --window-minutes 1",
+            "parallelism": 2,
+            "savepointPath": null,
+            "allowNonRestoredState": false
+        }')
+
+    echo "Job submission response: ${JOB_RESPONSE}"
+
+    # Extract Job ID from response
+    if command -v jq &> /dev/null; then
+        JOB_ID=$(echo "${JOB_RESPONSE}" | jq -r '.jobid')
+    else
+        JOB_ID=$(echo "${JOB_RESPONSE}" | grep -o '"jobid":"[^"]*' | sed 's/"jobid":"//')
+    fi
+
+    if [ -z "${JOB_ID}" ] || [ "${JOB_ID}" = "null" ]; then
+        echo "ERROR: Failed to submit job"
+        echo "Response: ${JOB_RESPONSE}"
+        exit 1
+    fi
+
+    echo ""
+    echo "✓ Job submitted successfully!"
+    echo "Job ID: ${JOB_ID}"
+    echo "Job status URL: http://${JOBMANAGER}/#/job/${JOB_ID}"
+
+    # Wait a bit and check job status
+    sleep 5
+    if command -v jq &> /dev/null; then
+        JOB_STATUS=$(curl -s "http://${JOBMANAGER}/jobs/${JOB_ID}" | jq -r '.state // "UNKNOWN"')
+    else
+        JOB_STATUS=$(curl -s "http://${JOBMANAGER}/jobs/${JOB_ID}" | grep -o '"state":"[^"]*' | sed 's/"state":"//' || echo "UNKNOWN")
+    fi
+    echo "Job status: ${JOB_STATUS}"
+
+else
+    echo "Running locally - using kubectl to execute in cluster..."
+    echo ""
+    echo "Option 1: Run script inside a pod with the JAR"
+    echo "  kubectl run flink-job-submitter --rm -it --image=curlimages/curl:latest --restart=Never -n ${NAMESPACE} -- sh"
+    echo ""
+    echo "Option 2: Use Flink CLI from JobManager pod"
+    echo "  kubectl exec -it -n ${NAMESPACE} \$(kubectl get pod -n ${NAMESPACE} -l app=flink,component=jobmanager -o jsonpath='{.items[0].metadata.name}') -- /opt/flink/bin/flink run -c org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob /path/to/jar"
+    echo ""
+    echo "Option 3: Port-forward and use REST API"
+    echo "  kubectl port-forward -n ${NAMESPACE} svc/flink-jobmanager 8081:8081"
+    echo "  Then run ./submit-job.sh <path-to-jar-file>"
+    exit 1
+fi
+
diff --git a/e2e-iot/high-infra/k8s/flink/submit-job-from-image.sh b/e2e-iot/high-infra/k8s/flink/submit-job-from-image.sh
new file mode 100755
index 0000000..706c1db
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/submit-job-from-image.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -e
+
+# Submit Flink job using flink run command
+# The JAR is already embedded in the Flink image at /opt/flink/usrlib/fluss-flink-realtime-demo.jar
+# This script uses the JAR directly from the image without uploading
+
+NAMESPACE="${NAMESPACE:-fluss}"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Note: S3 ConfigMap configuration is now handled by deploy.sh during deployment
+# No need to update it here during job submission
+echo ""
+
+JOBMANAGER_POD=$(kubectl get pod -n ${NAMESPACE} -l app=flink,component=jobmanager -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+
+if [ -z "${JOBMANAGER_POD}" ]; then
+    echo "ERROR: Flink JobManager pod not found in namespace ${NAMESPACE}"
+    exit 1
+fi
+
+echo "=== Submitting Flink Aggregator Job ==="
+echo "JobManager Pod: ${JOBMANAGER_POD}"
+echo "Namespace: ${NAMESPACE}"
+echo "JAR Location: /opt/flink/usrlib/fluss-flink-realtime-demo.jar (from image)"
+echo ""
+
+# Cancel existing running jobs
+echo "[1/3] Cancelling existing Flink jobs..."
+EXISTING_JOBS=$(kubectl exec -n ${NAMESPACE} ${JOBMANAGER_POD} -- curl -s http://localhost:8081/jobs 2>/dev/null | jq -r '.jobs[]? | select(.status == "RUNNING" or .status == "CREATED") | .id' 2>/dev/null || echo "")
+
+if [ -n "${EXISTING_JOBS}" ]; then
+    echo "${EXISTING_JOBS}" | while read job_id; do
+        if [ -n "${job_id}" ]; then
+            echo "  Cancelling job: ${job_id}"
+            kubectl exec -n ${NAMESPACE} ${JOBMANAGER_POD} -- curl -s -X PATCH "http://localhost:8081/jobs/${job_id}" > /dev/null 2>&1
+            if [ $? -eq 0 ]; then
+                echo "    ✓ Job ${job_id} cancelled"
+            else
+                echo "    ⚠ Failed to cancel job ${job_id}"
+            fi
+        fi
+    done
+    # Wait a moment for jobs to be cancelled
+    sleep 3
+else
+    echo "  ℹ No running jobs found"
+fi
+echo ""
+
+# Step 2: confirm the job JAR is present inside the JobManager container
+echo "[2/3] Verifying JAR exists in Flink image..."
+if kubectl exec -n ${NAMESPACE} ${JOBMANAGER_POD} -- test -f /opt/flink/usrlib/fluss-flink-realtime-demo.jar; then
+    JAR_EXISTS="yes"
+else
+    echo "ERROR: JAR file not found at /opt/flink/usrlib/fluss-flink-realtime-demo.jar"
+    echo "Please ensure the Flink image contains the JAR file"
+    echo "You may need to rebuild and push the image using build-and-push.sh"
+    exit 1
+fi
+# The reported size is the fifth column of "ls -lh"
+JAR_SIZE=$(kubectl exec -n ${NAMESPACE} ${JOBMANAGER_POD} -- ls -lh /opt/flink/usrlib/fluss-flink-realtime-demo.jar | awk '{print $5}')
+echo "  ✓ JAR found (size: ${JAR_SIZE})"
+echo ""
+
+# Step 3: submit via REST, first referencing the JAR in place, then falling back to upload + run
+echo "[3/3] Submitting job via REST API using local JAR..."
+# Use local:// protocol to reference JAR from JobManager's local filesystem. NOTE(review): confirm POST /v1/jobs accepts this JSON body; if it does not, the upload fallback below always runs
+JOB_RESPONSE=$(kubectl exec -n ${NAMESPACE} ${JOBMANAGER_POD} -- curl -s -X POST \
+    "http://localhost:8081/v1/jobs" \
+    -H "Content-Type: application/json" \
+    -d "{
+        \"jarFile\": \"local:///opt/flink/usrlib/fluss-flink-realtime-demo.jar\",
+        \"entryClass\": \"org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob\",
+        \"programArgs\": \"--bootstrap coordinator-server-hs.${NAMESPACE}.svc.cluster.local:9124 --database iot --table sensor_readings --window-minutes 1\",
+        \"parallelism\": 192
+    }" 2>&1)
+
+echo "Job submission response: ${JOB_RESPONSE}"
+
+# Extract Job ID from response (the 2>&1 above merges stderr into the captured text)
+JOB_ID=$(echo "${JOB_RESPONSE}" | grep -o '"jobid":"[^"]*' | sed 's/"jobid":"//' || echo "")
+
+if [ -z "${JOB_ID}" ]; then
+    # Try alternative extraction with jq (runs locally; any jq failure leaves JOB_ID empty)
+    JOB_ID=$(echo "${JOB_RESPONSE}" | jq -r '.jobid // empty' 2>/dev/null || echo "")
+fi
+
+if [ -z "${JOB_ID}" ]; then
+    echo ""
+    echo "⚠️  Could not extract Job ID from response"
+    echo "Response: ${JOB_RESPONSE}"
+    echo ""
+    echo "Trying alternative method: Upload JAR first, then submit..."
+    
+    # Alternative: Upload JAR from local path, then submit (the path is resolved inside the pod, where curl runs)
+    UPLOAD_RESPONSE=$(kubectl exec -n ${NAMESPACE} ${JOBMANAGER_POD} -- curl -s -X POST \
+        "http://localhost:8081/v1/jars/upload" \
+        -H "Content-Type: multipart/form-data" \
+        -F "jarfile=@/opt/flink/usrlib/fluss-flink-realtime-demo.jar" 2>&1)
+    
+    JAR_ID=$(echo "${UPLOAD_RESPONSE}" | grep -o 'flink-web-upload/[^"]*' | sed 's|flink-web-upload/||' || echo "")
+    
+    if [ -z "${JAR_ID}" ]; then
+        echo "ERROR: Failed to upload JAR"
+        echo "Upload response: ${UPLOAD_RESPONSE}"
+        exit 1
+    fi
+    
+    echo "✓ JAR uploaded with ID: ${JAR_ID}"
+    
+    # Submit job from the uploaded JAR; JOB_RESPONSE and JOB_ID are overwritten with this attempt's result
+    JOB_RESPONSE=$(kubectl exec -n ${NAMESPACE} ${JOBMANAGER_POD} -- curl -s -X POST \
+        "http://localhost:8081/v1/jars/${JAR_ID}/run" \
+        -H "Content-Type: application/json" \
+        -d "{
+            \"entryClass\": \"org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob\",
+            \"programArgs\": \"--bootstrap coordinator-server-hs.${NAMESPACE}.svc.cluster.local:9124 --database iot --table sensor_readings --window-minutes 1\",
+            \"parallelism\": 192
+        }" 2>&1)
+    
+    JOB_ID=$(echo "${JOB_RESPONSE}" | grep -o '"jobid":"[^"]*' | sed 's/"jobid":"//' || echo "")
+fi
+
+if [ -z "${JOB_ID}" ]; then
+    echo ""
+    echo "ERROR: Failed to extract Job ID from submission response"
+    echo "Response: ${JOB_RESPONSE}"
+    exit 1
+fi
+
+echo ""
+echo "✓ Job submitted successfully!"
+echo "Job ID: ${JOB_ID}"
+
+echo ""
+echo "Monitor job at:"
+echo "  kubectl port-forward -n ${NAMESPACE} svc/flink-jobmanager 8081:8081"
+if [ -n "${JOB_ID}" ]; then
+    echo "  Then open: http://localhost:8081/#/job/${JOB_ID}"
+else
+    echo "  Then open: http://localhost:8081"
+fi
+echo ""
+echo "View metrics dashboard:"
+echo "  kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80"
+echo "  Then open: http://localhost:3000"
+echo "  Username: admin, Password: admin123"
+echo ""
+echo "Check job logs:"
+echo "  kubectl logs -n ${NAMESPACE} ${JOBMANAGER_POD} -f"
+
diff --git a/e2e-iot/high-infra/k8s/flink/submit-job-local.sh b/e2e-iot/high-infra/k8s/flink/submit-job-local.sh
new file mode 100755
index 0000000..173ed4e
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/submit-job-local.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -e
+
+# Submit Flink job using REST API
+# The JAR is embedded in the Flink image at /opt/flink/usrlib/fluss-flink-realtime-demo.jar
+
+NAMESPACE="${NAMESPACE:-fluss}"
+JOBMANAGER_POD=$(kubectl get pod -n ${NAMESPACE} -l app=flink,component=jobmanager -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+
+if [ -z "${JOBMANAGER_POD}" ]; then
+    echo "ERROR: Flink JobManager pod not found in namespace ${NAMESPACE}"
+    exit 1
+fi
+
+echo "=== Submitting Flink Aggregator Job ==="
+echo "JobManager Pod: ${JOBMANAGER_POD}"
+echo "Namespace: ${NAMESPACE}"
+echo ""
+
+# Cancel existing running jobs
+echo "[1/4] Cancelling existing Flink jobs..."
+EXISTING_JOBS=$(kubectl exec -n ${NAMESPACE} ${JOBMANAGER_POD} -- curl -s http://localhost:8081/jobs 2>/dev/null | jq -r '.jobs[]? | select(.status == "RUNNING" or .status == "CREATED") | .id' 2>/dev/null || echo "")
+
+if [ -n "${EXISTING_JOBS}" ]; then
+    echo "${EXISTING_JOBS}" | while read job_id; do
+        if [ -n "${job_id}" ]; then
+            echo "  Cancelling job: ${job_id}"
+            kubectl exec -n ${NAMESPACE} ${JOBMANAGER_POD} -- curl -s -X PATCH "http://localhost:8081/jobs/${job_id}" > /dev/null 2>&1
+            if [ $? -eq 0 ]; then
+                echo "    ✓ Job ${job_id} cancelled"
+            else
+                echo "    ⚠ Failed to cancel job ${job_id}"
+            fi
+        fi
+    done
+    # Wait a moment for jobs to be cancelled
+    sleep 3
+else
+    echo "  ℹ No running jobs found"
+fi
+echo ""
+
+# Upload JAR via REST API
+echo "[2/3] Uploading JAR to Flink cluster..."
+echo "Uploading JAR to Flink cluster..."
+UPLOAD_RESPONSE=$(kubectl exec -n ${NAMESPACE} ${JOBMANAGER_POD} -- curl -s -X POST \
+    "http://localhost:8081/v1/jars/upload" \
+    -H "Content-Type: multipart/form-data" \
+    -F "jarfile=@/opt/flink/usrlib/fluss-flink-realtime-demo.jar")
+
+echo "Upload response: ${UPLOAD_RESPONSE}"
+
+# Extract JAR ID from response
+JAR_ID=$(echo "${UPLOAD_RESPONSE}" | grep -o 'flink-web-upload/[^"]*' | sed 's|flink-web-upload/||' || echo "")
+
+if [ -z "${JAR_ID}" ]; then
+    echo "ERROR: Failed to extract JAR ID from upload response"
+    echo "Response: ${UPLOAD_RESPONSE}"
+    exit 1
+fi
+
+echo "✓ JAR uploaded with ID: ${JAR_ID}"
+echo ""
+
+# Submit job via REST API: POST /v1/jars/<JAR_ID>/run with entry class, program arguments and parallelism
+echo "[3/3] Submitting job via REST API..."
+JOB_RESPONSE=$(kubectl exec -n ${NAMESPACE} ${JOBMANAGER_POD} -- curl -s -X POST \
+    "http://localhost:8081/v1/jars/${JAR_ID}/run" \
+    -H "Content-Type: application/json" \
+    -d "{
+        \"entryClass\": \"org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob\",
+        \"programArgs\": \"--bootstrap coordinator-server-hs.${NAMESPACE}.svc.cluster.local:9124 --database iot --table sensor_readings --window-minutes 1\",
+        \"parallelism\": 192
+    }")
+
+echo "Job submission response: ${JOB_RESPONSE}"
+
+# Extract Job ID from response: the value following "jobid":" up to the next double quote (empty if absent)
+JOB_ID=$(echo "${JOB_RESPONSE}" | grep -o '"jobid":"[^"]*' | sed 's/"jobid":"//' || echo "")
+
+if [ -z "${JOB_ID}" ]; then
+    echo "ERROR: Failed to extract Job ID from submission response"
+    echo "Response: ${JOB_RESPONSE}"
+    exit 1
+fi
+
+# Success summary plus pointers to the Flink UI and the Grafana dashboard.
+# Every printf argument is emitted on its own line; "" produces a blank line.
+printf '%s\n' \
+    "" \
+    "✓ Job submitted successfully!" "Job ID: ${JOB_ID}" \
+    "" "Monitor job at:" \
+    "  kubectl port-forward -n ${NAMESPACE} svc/flink-jobmanager 8081:8081" \
+    "  Then open: http://localhost:8081/#/job/${JOB_ID}" \
+    "" "View metrics dashboard:" \
+    "  kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80" \
+    "  Then open: http://localhost:3000" \
+    "  Username: admin, Password: admin123"
+
diff --git a/e2e-iot/high-infra/k8s/flink/submit-job.sh b/e2e-iot/high-infra/k8s/flink/submit-job.sh
new file mode 100755
index 0000000..33dcef4
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/flink/submit-job.sh
@@ -0,0 +1,145 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -e
+
+# Manual submission of the Flink aggregator job through the JobManager REST API
+# Prerequisites:
+#   1. Port-forward Flink JobManager: kubectl port-forward -n fluss svc/flink-jobmanager 8081:8081
+#   2. Have the JAR file available locally
+
+NAMESPACE="${NAMESPACE:-fluss}"
+JOBMANAGER="${JOBMANAGER:-localhost:8081}"
+JAR_PATH="${1:-}"
+
+if [ -z "${JAR_PATH}" ]; then
+    cat <<EOF
+Usage: $0 <path-to-jar-file>
+
+Example:
+  $0 /path/to/fluss-flink-realtime-demo.jar
+
+Prerequisites:
+  1. Port-forward Flink JobManager:
+     kubectl port-forward -n ${NAMESPACE} svc/flink-jobmanager 8081:8081
+  2. Have the JAR file available
+EOF
+    exit 1
+fi
+[ -f "${JAR_PATH}" ] || {
+    echo "ERROR: JAR file not found: ${JAR_PATH}"
+    exit 1
+}
+echo "=== Submitting Flink Aggregator Job ==="
+echo "JAR: ${JAR_PATH}"
+echo "JobManager: ${JOBMANAGER}"
+echo ""
+
+# Check if JobManager is accessible
+if ! curl -s "http://${JOBMANAGER}/overview" > /dev/null 2>&1; then
+    echo "ERROR: Cannot connect to Flink JobManager at http://${JOBMANAGER}"
+    echo "Make sure port-forward is running:"
+    echo "  kubectl port-forward -n ${NAMESPACE} svc/flink-jobmanager 8081:8081"
+    exit 1
+fi
+
+# Upload JAR
+echo "Uploading JAR to Flink cluster..."
+UPLOAD_RESPONSE=$(curl -s -X POST \
+    "http://${JOBMANAGER}/v1/jars/upload" \
+    -H "Expect:" \
+    -F "jarfile=@${JAR_PATH}")
+
+echo "Upload response: ${UPLOAD_RESPONSE}"
+
+# Extract JAR ID
+if command -v jq &> /dev/null; then
+    JAR_ID=$(echo "${UPLOAD_RESPONSE}" | jq -r '.filename' | sed 's|.*/||')
+else
+    JAR_ID=$(echo "${UPLOAD_RESPONSE}" | grep -o '"filename":"[^"]*' | sed 's/"filename":"//' | sed 's|.*/||')
+fi
+
+if [ -z "${JAR_ID}" ] || [ "${JAR_ID}" = "null" ]; then
+    echo "ERROR: Failed to upload JAR"
+    echo "Response: ${UPLOAD_RESPONSE}"
+    exit 1
+fi
+
+echo "✓ JAR uploaded with ID: ${JAR_ID}"
+echo ""
+
+# Submit job: the JSON body is single-quoted, with the quotes closed around "${NAMESPACE}" so that it expands
+echo "Submitting job..."
+JOB_RESPONSE=$(curl -s -X POST \
+    "http://${JOBMANAGER}/v1/jars/${JAR_ID}/run" \
+    -H "Content-Type: application/json" \
+    -d '{
+        "entryClass": "org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob",
+        "programArgs": "--bootstrap coordinator-server-hs.'"${NAMESPACE}"'.svc.cluster.local:9124 --database iot --table sensor_readings --window-minutes 1",
+        "parallelism": 32,
+        "savepointPath": null,
+        "allowNonRestoredState": false
+    }')
+
+echo "Job submission response: ${JOB_RESPONSE}"
+echo ""
+
+# Extract Job ID
+if command -v jq &> /dev/null; then
+    JOB_ID=$(echo "${JOB_RESPONSE}" | jq -r '.jobid // empty')
+    ERRORS=$(echo "${JOB_RESPONSE}" | jq -r '.errors[]? // empty' 2>/dev/null || echo "")
+else
+    JAR_ID=$(echo "${JOB_RESPONSE}" | grep -o '"jobid":"[^"]*' | sed 's/"jobid":"//' || echo "")
+    ERRORS=$(echo "${JOB_RESPONSE}" | grep -o '"errors":\["[^"]*' | sed 's/"errors":\["//' || echo "")
+fi
+
+if [ -n "${ERRORS}" ]; then
+    echo "ERROR: Job submission failed"
+    echo "${ERRORS}"
+    exit 1
+fi
+
+if [ -z "${JOB_ID}" ] || [ "${JOB_ID}" = "null" ]; then
+    echo "ERROR: Failed to extract Job ID from response"
+    echo "Response: ${JOB_RESPONSE}"
+    exit 1
+fi
+
+echo "✓ Job submitted successfully!"
+echo "Job ID: ${JOB_ID}"
+echo "Job URL: http://${JOBMANAGER}/#/job/${JOB_ID}"
+echo ""
+# Pause briefly, then read the job state from the REST API
+sleep 3
+if command -v jq > /dev/null 2>&1; then
+    JOB_STATUS=$(curl -s "http://${JOBMANAGER}/jobs/${JOB_ID}" | jq -r '.state // "UNKNOWN"')
+else
+    JOB_STATUS=$(curl -s "http://${JOBMANAGER}/jobs/${JOB_ID}" | grep -o '"state":"[^"]*' | sed 's/"state":"//' || echo "UNKNOWN")
+fi
+echo "Job status: ${JOB_STATUS}"
+# FAILED and CANCELED are reported as an unsuccessful submission
+case "${JOB_STATUS}" in
+    FAILED | CANCELED)
+        echo "WARNING: Job is in ${JOB_STATUS} state"
+        exit 1
+        ;;
+esac
+
+echo ""
+echo "Job is running! Monitor it at: http://${JOBMANAGER}/#/job/${JOB_ID}"
+
diff --git a/e2e-iot/high-infra/k8s/jobs/PRODUCER_CONFIG.md b/e2e-iot/high-infra/k8s/jobs/PRODUCER_CONFIG.md
new file mode 100644
index 0000000..26951df
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/jobs/PRODUCER_CONFIG.md
@@ -0,0 +1,88 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Producer Optimal Configuration
+
+This document describes the optimal configuration parameters for the Fluss producer based on performance testing.
+
+## Optimal Settings
+
+### Performance Configuration
+- **PRODUCER_RATE**: `200000` records/sec
+- **PRODUCER_FLUSH_EVERY**: `5000` records
+- **CLIENT_WRITER_BATCH_TIMEOUT**: `90ms` (or `50ms` for lower latency)
+- **CLIENT_WRITER_BUFFER_MEMORY_SIZE**: `2gb`
+- **CLIENT_WRITER_BATCH_SIZE**: `128mb`
+
+### Resource Configuration
+- **PRODUCER_MEMORY_REQUEST**: `4Gi`
+- **PRODUCER_MEMORY_LIMIT**: `16Gi`
+- **PRODUCER_CPU_REQUEST**: `2000m`
+- **PRODUCER_CPU_LIMIT**: `8000m`
+
+### Threading Configuration
+- **NUM_WRITER_THREADS**: `48`
+- **BUCKETS**: `48`
+
+## Quick Deploy
+
+Use the optimal deployment script:
+
+```bash
+cd e2e-iot/high-infra/k8s/jobs
+./deploy-producer-optimal.sh
+```
+
+Or with custom parameters:
+
+```bash
+./deploy-producer-optimal.sh --rate 200000 --flush 5000 --batch-timeout 90ms
+```
+
+## Performance Impact
+
+### Batch Timeout
+- **10ms**: Low latency but poor throughput (~11,600 ops/sec)
+- **50ms**: Good balance (~172,000 ops/sec)
+- **90ms**: Higher throughput, slightly higher latency (~187,000+ ops/sec)
+
+### Flush Interval
+- **1000 records**: Too frequent, high overhead (~59,000 ops/sec)
+- **5000 records**: Optimal balance (~187,000+ ops/sec)
+- **20000+ records**: Higher throughput but higher latency
+
+## Default Values
+
+### Java Code Defaults
+- `PRODUCER_RATE`: 200,000
+- `PRODUCER_FLUSH_EVERY`: 200,000
+- `CLIENT_WRITER_BATCH_TIMEOUT`: 50ms (raised in code from the earlier 10ms default)
+
+### Deploy Script Defaults
+- `PRODUCER_RATE`: 200,000
+- `PRODUCER_FLUSH_EVERY`: 5,000
+- `CLIENT_WRITER_BATCH_TIMEOUT`: 50ms
+
+## Notes
+
+- The optimal configuration balances latency vs throughput
+- Batch timeout of 90ms allows more records to accumulate before sending
+- Flush interval of 5000 reduces CPU overhead from frequent flushing
+- These settings achieved ~187K+ ops/sec in testing
+
+
diff --git a/e2e-iot/high-infra/k8s/jobs/check-table-buckets.sh b/e2e-iot/high-infra/k8s/jobs/check-table-buckets.sh
new file mode 100755
index 0000000..1a53949
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/jobs/check-table-buckets.sh
@@ -0,0 +1,212 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Script to check Fluss table bucket count using Fluss Admin API
+# Can be run locally (with port-forward) or via kubectl exec
+# Every default below can be overridden through the environment or a flag.
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Default values
+NAMESPACE="${NAMESPACE:-fluss}"
+BOOTSTRAP="${BOOTSTRAP:-coordinator-server-hs.fluss.svc.cluster.local:9124}"
+DATABASE="${DATABASE:-iot}"
+TABLE="${TABLE:-sensor_readings}"
+DEMO_IMAGE_REPO="${DEMO_IMAGE_REPO:-343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss-demo}"
+DEMO_IMAGE_TAG="${DEMO_IMAGE_TAG:-latest}"
+USE_KUBECTL_EXEC="${USE_KUBECTL_EXEC:-false}"
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --namespace)
+            NAMESPACE="$2"
+            shift 2
+            ;;
+        --bootstrap)
+            BOOTSTRAP="$2"
+            shift 2
+            ;;
+        --database)
+            DATABASE="$2"
+            shift 2
+            ;;
+        --table)
+            TABLE="$2"
+            shift 2
+            ;;
+        --image-repo)
+            DEMO_IMAGE_REPO="$2"
+            shift 2
+            ;;
+        --image-tag)
+            DEMO_IMAGE_TAG="$2"
+            shift 2
+            ;;
+        --kubectl-exec)
+            USE_KUBECTL_EXEC="true"
+            shift
+            ;;
+        --help)
+            echo "Usage: $0 [OPTIONS]"
+            echo ""
+            echo "Check Fluss table bucket count using Fluss Admin API"
+            echo ""
+            echo "Options:"
+            echo "  --namespace NAMESPACE     Kubernetes namespace (default: fluss)"
+            echo "  --bootstrap BOOTSTRAP     Fluss coordinator address"
+            echo "                            Default: coordinator-server-hs.fluss.svc.cluster.local:9124"
+            echo "                            Use localhost:9124 if port-forwarding"
+            echo "  --database DATABASE       Database name (default: iot)"
+            echo "  --table TABLE             Table name (default: sensor_readings)"
+            echo "  --image-repo REPO         Docker image repository (for kubectl exec)"
+            echo "  --image-tag TAG           Docker image tag (for kubectl exec)"
+            echo "  --kubectl-exec            Run via kubectl exec instead of locally"
+            echo "  --help                    Show this help message"
+            echo ""
+            echo "Examples:"
+            echo "  # Check via port-forward (port-forward must be running):"
+            echo "  kubectl port-forward -n fluss svc/coordinator-server-hs 9124:9124 &"
+            echo "  $0 --bootstrap localhost:9124"
+            echo ""
+            echo "  # Check via kubectl exec (runs inside cluster):"
+            echo "  $0 --kubectl-exec"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+echo "=== Checking Fluss Table Bucket Count ==="
+echo "  Bootstrap: ${BOOTSTRAP}"
+echo "  Database: ${DATABASE}"
+echo "  Table: ${TABLE}"
+echo ""
+
+if [ "${USE_KUBECTL_EXEC}" = "true" ]; then
+    # Run via kubectl exec - need to find a pod with the demo jar
+    echo "Running check via kubectl exec..."
+
+    # Try to find a running pod with the demo image (kubectl exec cannot enter a completed pod)
+    POD_NAME=$(kubectl get pods -n "${NAMESPACE}" -l app=fluss-setup,component=table-creator --field-selector=status.phase=Running -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+
+    if [ -z "${POD_NAME}" ]; then
+        # Try producer pod
+        POD_NAME=$(kubectl get pods -n "${NAMESPACE}" -l app=fluss-producer --field-selector=status.phase=Running -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+    fi
+
+    if [ -z "${POD_NAME}" ]; then
+        echo "ERROR: No pod found with demo image. Creating a temporary pod..."
+
+        # Fix the job name up front so that wait/logs/cleanup below target exactly the job created here
+        JOB_NAME="fluss-check-buckets-$(date +%s)"
+
+        # Create a temporary job to check bucket count
+        if ! cat <<EOF | kubectl apply -f -
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  namespace: ${NAMESPACE}
+spec:
+  ttlSecondsAfterFinished: 60
+  template:
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: check-buckets
+          image: ${DEMO_IMAGE_REPO}:${DEMO_IMAGE_TAG}
+          imagePullPolicy: Always
+          command:
+            - java
+          args:
+            - --add-opens=java.base/java.util=ALL-UNNAMED
+            - --add-opens=java.base/java.lang=ALL-UNNAMED
+            - --add-opens=java.base/java.nio=ALL-UNNAMED
+            - --add-opens=java.base/java.time=ALL-UNNAMED
+            - -cp
+            - /opt/flink/usrlib/fluss-flink-realtime-demo.jar
+            - org.apache.fluss.benchmark.e2eplatformaws.inspect.FlussTableBucketChecker
+            - ${BOOTSTRAP}
+            - ${DATABASE}
+            - ${TABLE}
+          resources:
+            requests:
+              memory: "512Mi"
+              cpu: "500m"
+            limits:
+              memory: "1Gi"
+              cpu: "1000m"
+EOF
+        then
+            echo "ERROR: Failed to create check job"
+            exit 1
+        fi
+
+        echo "Waiting for job to complete..."
+        kubectl wait --for=condition=complete --timeout=60s "job/${JOB_NAME}" -n "${NAMESPACE}" || true
+
+        echo ""
+        echo "Job logs:"
+        kubectl logs -n "${NAMESPACE}" -l "job-name=${JOB_NAME}" --tail=50
+
+        # Cleanup
+        kubectl delete job "${JOB_NAME}" -n "${NAMESPACE}" --ignore-not-found=true
+
+    else
+        echo "Using pod: ${POD_NAME}"
+        kubectl exec -n "${NAMESPACE}" "${POD_NAME}" -- java \
+            --add-opens=java.base/java.util=ALL-UNNAMED \
+            --add-opens=java.base/java.lang=ALL-UNNAMED \
+            --add-opens=java.base/java.nio=ALL-UNNAMED \
+            --add-opens=java.base/java.time=ALL-UNNAMED \
+            -cp /opt/flink/usrlib/fluss-flink-realtime-demo.jar \
+            org.apache.fluss.benchmark.e2eplatformaws.inspect.FlussTableBucketChecker \
+            "${BOOTSTRAP}" "${DATABASE}" "${TABLE}"
+    fi
+else
+    # Run locally - need the demo jar
+    DEMO_JAR="${SCRIPT_DIR}/../../../demos/demo/fluss_flink_realtime_demo/target/fluss-flink-realtime-demo.jar"
+
+    if [ ! -f "${DEMO_JAR}" ]; then
+        echo "ERROR: Demo JAR not found at ${DEMO_JAR}"
+        echo "Please build it first:"
+        echo "  mvn -pl demos/demo/fluss_flink_realtime_demo -am clean package"
+        echo ""
+        echo "Or use --kubectl-exec to run inside the cluster"
+        exit 1
+    fi
+
+    echo "Running check locally..."
+    java \
+        --add-opens=java.base/java.util=ALL-UNNAMED \
+        --add-opens=java.base/java.lang=ALL-UNNAMED \
+        --add-opens=java.base/java.nio=ALL-UNNAMED \
+        --add-opens=java.base/java.time=ALL-UNNAMED \
+        -cp "${DEMO_JAR}" \
+        org.apache.fluss.benchmark.e2eplatformaws.inspect.FlussTableBucketChecker \
+        "${BOOTSTRAP}" "${DATABASE}" "${TABLE}"
+fi
+
+
diff --git a/e2e-iot/high-infra/k8s/jobs/create-table-job.yaml b/e2e-iot/high-infra/k8s/jobs/create-table-job.yaml
new file mode 100644
index 0000000..eef35f8
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/jobs/create-table-job.yaml
@@ -0,0 +1,85 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: batch/v1
+kind: Job  # template: the placeholders are filled in by envsubst in create-table.sh before kubectl apply
+metadata:
+  name: fluss-create-table
+  namespace: ${NAMESPACE}
+  labels:
+    app: fluss-setup
+    component: table-creator
+spec:
+  backoffLimit: 3  # retries allowed before the Job is marked Failed
+  ttlSecondsAfterFinished: 300  # the finished Job is removed automatically after 5 minutes
+  template:
+    metadata:
+      labels:  # create-table.sh selects the pod logs by these two labels
+        app: fluss-setup
+        component: table-creator
+    spec:
+      restartPolicy: OnFailure
+      initContainers:
+        - name: wait-for-coordinator  # blocks the main container until the coordinator port accepts connections
+          image: busybox:1.36
+          command:
+            - sh
+            - -c
+            - |
+              echo "Waiting for Fluss coordinator to be ready..."
+              until nc -z coordinator-server-hs.${NAMESPACE}.svc.cluster.local 9124; do
+                echo "Coordinator not ready, waiting..."
+                sleep 2
+              done
+              echo "Coordinator is ready!"
+      containers:
+        - name: create-table
+          image: ${DEMO_IMAGE_REPO}:${DEMO_IMAGE_TAG}
+          imagePullPolicy: Always
+          command:
+            - java
+          args:
+            - --add-opens=java.base/java.util=ALL-UNNAMED
+            - --add-opens=java.base/java.lang=ALL-UNNAMED
+            - --add-opens=java.base/java.nio=ALL-UNNAMED
+            - --add-opens=java.base/java.time=ALL-UNNAMED
+            - -cp
+            - /opt/flink/usrlib/fluss-flink-realtime-demo.jar
+            - org.apache.fluss.benchmark.e2eplatformaws.setup.CreateTableWithBuckets
+            - "${BOOTSTRAP}"  # quoted like BUCKETS so a numeric or boolean-looking value stays a YAML string
+            - "${DATABASE}"
+            - "${TABLE}"
+            - "${BUCKETS}"
+            - "true"  # NOTE(review): meaning of this flag is defined by CreateTableWithBuckets - confirm
+          env:
+            - name: BOOTSTRAP
+              value: "${BOOTSTRAP}"
+            - name: DATABASE
+              value: "${DATABASE}"
+            - name: TABLE
+              value: "${TABLE}"
+            - name: BUCKETS
+              value: "${BUCKETS}"
+          resources:
+            requests:
+              memory: "512Mi"
+              cpu: "500m"
+            limits:
+              memory: "1Gi"
+              cpu: "1000m"
+
+
diff --git a/e2e-iot/high-infra/k8s/jobs/create-table.sh b/e2e-iot/high-infra/k8s/jobs/create-table.sh
new file mode 100755
index 0000000..9867872
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/jobs/create-table.sh
@@ -0,0 +1,165 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Script to create Fluss table with specified number of buckets
+# This should be run before deploying the producer
+# Every default below can be overridden through the environment or a flag.
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+K8S_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+# Default values
+NAMESPACE="${NAMESPACE:-fluss}"
+BOOTSTRAP="${BOOTSTRAP:-coordinator-server-hs.fluss.svc.cluster.local:9124}"
+DATABASE="${DATABASE:-iot}"
+TABLE="${TABLE:-sensor_readings}"
+BUCKETS="${BUCKETS:-128}"
+DEMO_IMAGE_REPO="${DEMO_IMAGE_REPO:-343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss-demo}"
+DEMO_IMAGE_TAG="${DEMO_IMAGE_TAG:-latest}"
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --namespace)
+            NAMESPACE="$2"
+            shift 2
+            ;;
+        --bootstrap)
+            BOOTSTRAP="$2"
+            shift 2
+            ;;
+        --database)
+            DATABASE="$2"
+            shift 2
+            ;;
+        --table)
+            TABLE="$2"
+            shift 2
+            ;;
+        --buckets)
+            BUCKETS="$2"
+            shift 2
+            ;;
+        --image-repo)
+            DEMO_IMAGE_REPO="$2"
+            shift 2
+            ;;
+        --image-tag)
+            DEMO_IMAGE_TAG="$2"
+            shift 2
+            ;;
+        --help)
+            echo "Usage: $0 [OPTIONS]"
+            echo ""
+            echo "Options:"
+            echo "  --namespace NAMESPACE     Kubernetes namespace (default: fluss)"
+            echo "  --bootstrap BOOTSTRAP     Fluss coordinator address (default: coordinator-server-hs.fluss.svc.cluster.local:9124)"
+            echo "  --database DATABASE       Database name (default: iot)"
+            echo "  --table TABLE             Table name (default: sensor_readings)"
+            echo "  --buckets BUCKETS         Number of buckets (default: 128)"
+            echo "  --image-repo REPO         Docker image repository (required, no default)"
+            echo "  --image-tag TAG           Docker image tag (default: latest)"
+            echo "  --help                    Show this help message"
+            echo ""
+            echo "Environment variables:"
+            echo "  NAMESPACE, BOOTSTRAP, DATABASE, TABLE, BUCKETS, DEMO_IMAGE_REPO, DEMO_IMAGE_TAG"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+echo "=== Creating Fluss Table with ${BUCKETS} Buckets ==="
+echo "  Namespace: ${NAMESPACE}"
+echo "  Bootstrap: ${BOOTSTRAP}"
+echo "  Database: ${DATABASE}"
+echo "  Table: ${TABLE}"
+echo "  Buckets: ${BUCKETS}"
+echo "  Image: ${DEMO_IMAGE_REPO}:${DEMO_IMAGE_TAG}"
+echo ""
+
+# Export variables for envsubst
+export NAMESPACE BOOTSTRAP DATABASE TABLE BUCKETS DEMO_IMAGE_REPO DEMO_IMAGE_TAG
+
+# Check if namespace exists
+if ! kubectl get namespace "${NAMESPACE}" &>/dev/null; then
+    echo "ERROR: Namespace '${NAMESPACE}' does not exist"
+    exit 1
+fi
+
+# Delete existing job if it exists (to allow re-running)
+echo "Cleaning up any existing create-table job..."
+kubectl delete job -n "${NAMESPACE}" fluss-create-table --ignore-not-found=true
+
+# Wait a moment for cleanup
+sleep 2
+
+# Apply the job YAML
+echo "Creating table creation job..."
+envsubst < "${K8S_DIR}/jobs/create-table-job.yaml" | kubectl apply -f -
+
+# Wait for job to complete
+echo ""
+echo "Waiting for job to complete..."
+JOB_NAME="fluss-create-table"
+
+# Poll every 2s until the job reports Complete; stop early on a Failed condition or after TIMEOUT seconds
+TIMEOUT=60
+ELAPSED=0
+while ! kubectl get job -n "${NAMESPACE}" "${JOB_NAME}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; do
+    if kubectl get job -n "${NAMESPACE}" "${JOB_NAME}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null | grep -q "True"; then
+        echo ""
+        echo "ERROR: Job failed!"
+        echo "Job logs:"
+        kubectl logs -n "${NAMESPACE}" -l app=fluss-setup,component=table-creator --tail=50
+        exit 1
+    fi
+
+    if [ "${ELAPSED}" -ge "${TIMEOUT}" ]; then
+        echo ""
+        echo "ERROR: Job did not complete within ${TIMEOUT} seconds"
+        echo "Job status:"
+        kubectl get job -n "${NAMESPACE}" "${JOB_NAME}"
+        echo ""
+        echo "Job logs:"
+        kubectl logs -n "${NAMESPACE}" -l app=fluss-setup,component=table-creator --tail=50
+        exit 1
+    fi
+
+    sleep 2
+    ELAPSED=$((ELAPSED + 2))
+    echo -n "."
+done
+
+echo ""
+echo "✓ Job completed successfully!"
+echo ""
+echo "Job logs:"
+kubectl logs -n "${NAMESPACE}" -l app=fluss-setup,component=table-creator --tail=50 || echo "  Could not retrieve logs"
+
+echo ""
+echo "=== Table Creation Complete ==="
+echo "Table '${DATABASE}.${TABLE}' is ready with ${BUCKETS} buckets"
+echo "You can now deploy the producer to use this table"
+
diff --git a/e2e-iot/high-infra/k8s/jobs/deploy-producer-multi-instance.sh b/e2e-iot/high-infra/k8s/jobs/deploy-producer-multi-instance.sh
new file mode 100755
index 0000000..065e849
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/jobs/deploy-producer-multi-instance.sh
@@ -0,0 +1,244 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -e
+
+# Deploy multiple producer instances (8 total, 2 per node across 4 nodes)
+# This script deploys 8 producer jobs with instance IDs 0-7
+# Topology spread constraints ensure 2 pods run per producer node
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Configuration
+export NAMESPACE="${NAMESPACE:-fluss}"
+export TOTAL_PRODUCERS="${TOTAL_PRODUCERS:-8}"
+export PRODUCER_RATE="${PRODUCER_RATE:-250000}"
+export PRODUCER_FLUSH_EVERY="${PRODUCER_FLUSH_EVERY:-5000}"
+export PRODUCER_STATS_EVERY="${PRODUCER_STATS_EVERY:-50000}"
+export CLIENT_WRITER_BATCH_TIMEOUT="${CLIENT_WRITER_BATCH_TIMEOUT:-90ms}"
+export CLIENT_WRITER_BUFFER_MEMORY_SIZE="${CLIENT_WRITER_BUFFER_MEMORY_SIZE:-2gb}"
+export CLIENT_WRITER_BATCH_SIZE="${CLIENT_WRITER_BATCH_SIZE:-128mb}"
+export PRODUCER_MEMORY_REQUEST="${PRODUCER_MEMORY_REQUEST:-4Gi}"
+export PRODUCER_MEMORY_LIMIT="${PRODUCER_MEMORY_LIMIT:-16Gi}"
+export PRODUCER_CPU_REQUEST="${PRODUCER_CPU_REQUEST:-2000m}"
+export PRODUCER_CPU_LIMIT="${PRODUCER_CPU_LIMIT:-8000m}"
+export BOOTSTRAP="${BOOTSTRAP:-coordinator-server-hs.fluss.svc.cluster.local:9124}"
+export DATABASE="${DATABASE:-iot}"
+export TABLE="${TABLE:-sensor_readings}"
+export BUCKETS="${BUCKETS:-128}"
+export NUM_WRITER_THREADS="${NUM_WRITER_THREADS:-48}"
+export DEMO_IMAGE_REPO="${DEMO_IMAGE_REPO:-343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss-demo}"
+export DEMO_IMAGE_TAG="${DEMO_IMAGE_TAG:-latest}"
+
+# Parse command line arguments
+WAIT_FOR_READY=false
+SHOW_LOGS=false
+CLEANUP_EXISTING=true
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --namespace)
+            export NAMESPACE="$2"
+            shift 2
+            ;;
+        --rate)
+            export PRODUCER_RATE="$2"
+            shift 2
+            ;;
+        --total-producers)
+            export TOTAL_PRODUCERS="$2"
+            shift 2
+            ;;
+        --no-cleanup)
+            CLEANUP_EXISTING=false
+            shift
+            ;;
+        --wait)
+            WAIT_FOR_READY=true
+            shift
+            ;;
+        --logs)
+            SHOW_LOGS=true
+            shift
+            ;;
+        -h|--help)
+            echo "Usage: $0 [options]"
+            echo ""
+            echo "Deploys ${TOTAL_PRODUCERS} producer instances (2 per node across 4 nodes)"
+            echo ""
+            echo "Options:"
+            echo "  --namespace NAMESPACE          Kubernetes namespace (default: fluss)"
+            echo "  --rate RATE                    Records per second per instance (default: 250000)"
+            echo "  --total-producers COUNT         Total number of producer instances (default: 8)"
+            echo "  --no-cleanup                   Don't delete existing producer jobs before deploying"
+            echo "  --wait                         Wait for all pods to be ready"
+            echo "  --logs                         Show logs after deployment"
+            echo ""
+            echo "This script will:"
+            echo "  1. Delete existing producer jobs (unless --no-cleanup)"
+            echo "  2. Deploy ${TOTAL_PRODUCERS} producer jobs with instance IDs 0-$((TOTAL_PRODUCERS-1))"
+            echo "  3. Topology spread constraints ensure 2 pods per producer node"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+# kubectl must be on PATH
+command -v kubectl > /dev/null 2>&1 || {
+    echo "ERROR: kubectl is not installed or not in PATH"
+    exit 1
+}
+
+# The target namespace has to exist already
+kubectl get namespace "${NAMESPACE}" > /dev/null 2>&1 || {
+    echo "ERROR: Namespace ${NAMESPACE} does not exist"
+    exit 1
+}
+
+# Count the nodes labelled node-type=producer; none at all is fatal
+PRODUCER_NODES=$(kubectl get nodes -l node-type=producer --no-headers 2>/dev/null | wc -l | tr -d ' ')
+if [ "${PRODUCER_NODES}" -eq 0 ]; then
+    echo "ERROR: No producer nodes found. Please deploy infrastructure first."
+    echo "Expected 4 producer nodes with label node-type=producer"
+    exit 1
+fi
+
+if [ "${PRODUCER_NODES}" -lt 4 ]; then
+    echo "WARNING: Found only ${PRODUCER_NODES} producer nodes, expected 4"
+    echo "Topology spread may not work as expected"
+fi
+
+printf '%s\n' "=== Deploying ${TOTAL_PRODUCERS} Producer Instances ===" \
+    "Namespace: ${NAMESPACE}" \
+    "Producer Nodes Available: ${PRODUCER_NODES}" \
+    "Image: ${DEMO_IMAGE_REPO}:${DEMO_IMAGE_TAG}" \
+    "Rate per instance: ${PRODUCER_RATE} records/sec" \
+    "Total rate: $((PRODUCER_RATE * TOTAL_PRODUCERS)) records/sec" \
+    "Instances: 0-$((TOTAL_PRODUCERS-1))" \
+    ""
+
+# Cleanup existing producer jobs
+if [ "${CLEANUP_EXISTING}" = true ]; then
+    echo "[1/3] Cleaning up existing producer jobs..."
+    EXISTING_JOBS=$(kubectl get jobs -n "${NAMESPACE}" -l app=fluss-producer -o name 2>/dev/null || echo "")
+    if [ -n "${EXISTING_JOBS}" ]; then
+        echo "${EXISTING_JOBS}" | xargs -r kubectl delete -n "${NAMESPACE}"
+        echo "  ✓ Existing jobs deleted"
+        sleep 3
+    else
+        echo "  ℹ No existing jobs found"
+    fi
+    echo ""
+fi
+
+# Deploy producer jobs (arithmetic loop: never iterates for a count of 0, unlike BSD `seq 0 -1` which counts down)
+echo "[2/3] Deploying ${TOTAL_PRODUCERS} producer jobs..."
+for ((INSTANCE_ID = 0; INSTANCE_ID < TOTAL_PRODUCERS; INSTANCE_ID++)); do
+    export INSTANCE_ID
+    JOB_NAME="fluss-producer-${INSTANCE_ID}"
+
+    echo "  Deploying instance ${INSTANCE_ID} (job: ${JOB_NAME})..."
+
+    # Create job YAML with instance-specific name.
+    # The pipeline is tested directly by `if`: under `set -e` a separate `$?` check
+    # would never see a failure, because the script would already have exited.
+    if envsubst < "${SCRIPT_DIR}/producer-job.yaml" | \
+        sed "s/name: fluss-producer/name: ${JOB_NAME}/" | \
+        kubectl apply -f - > /dev/null; then
+        echo "    ✓ Instance ${INSTANCE_ID} deployed"
+    else
+        echo "    ✗ Failed to deploy instance ${INSTANCE_ID}"
+        exit 1
+    fi
+done
+echo ""
+
+# Wait for pods to be ready if requested
+if [ "${WAIT_FOR_READY}" = true ]; then
+    echo "[3/3] Waiting for producer pods to be ready..."
+    TIMEOUT=600
+    ELAPSED=0
+    while [ $ELAPSED -lt $TIMEOUT ]; do
+        READY_PODS=$(kubectl get pods -n "${NAMESPACE}" -l app=fluss-producer --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')
+        TOTAL_PODS=$(kubectl get pods -n "${NAMESPACE}" -l app=fluss-producer --no-headers 2>/dev/null | wc -l | tr -d ' ')
+        
+        if [ "${READY_PODS}" -eq "${TOTAL_PRODUCERS}" ] && [ "${TOTAL_PODS}" -eq "${TOTAL_PRODUCERS}" ]; then
+            echo "  ✓ All ${TOTAL_PRODUCERS} producer pods are ready"
+            break
+        fi
+        
+        echo "  Waiting... (${READY_PODS}/${TOTAL_PRODUCERS} pods ready)"
+        sleep 5
+        ELAPSED=$((ELAPSED + 5))
+    done
+    
+    if [ $ELAPSED -ge $TIMEOUT ]; then
+        echo "  ⚠ Timeout waiting for all pods to be ready"
+        echo "  Check status: kubectl get pods -n ${NAMESPACE} -l app=fluss-producer"
+    fi
+    echo ""
+fi
+
+# Show deployment status
+echo "=== Deployment Status ==="
+echo ""
+echo "Producer Jobs:"
+kubectl get jobs -n "${NAMESPACE}" -l app=fluss-producer
+echo ""
+echo "Producer Pods:"
+kubectl get pods -n "${NAMESPACE}" -l app=fluss-producer -o wide
+echo ""
+
+# Show pod distribution across nodes
+echo "Pod Distribution Across Nodes:"
+kubectl get pods -n "${NAMESPACE}" -l app=fluss-producer -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.nodeName}{"\n"}{end}' | \
+    sort -k2 | \
+    awk '{nodes[$2]++} END {for (node in nodes) print "  " node ": " nodes[node] " pod(s)"}'
+echo ""
+
+# Show logs if requested
+if [ "${SHOW_LOGS}" = true ]; then
+    echo "=== Producer Logs (last 20 lines per instance) ==="
+    for INSTANCE_ID in $(seq 0 $((TOTAL_PRODUCERS-1))); do
+        POD=$(kubectl get pod -n "${NAMESPACE}" -l app=fluss-producer --field-selector=status.phase=Running -o jsonpath="{.items[?(@.metadata.labels['job-name']=='fluss-producer-${INSTANCE_ID}')].metadata.name}" 2>/dev/null || echo "")
+        if [ -n "${POD}" ]; then
+            echo "--- Instance ${INSTANCE_ID} (${POD}) ---"
+            kubectl logs -n "${NAMESPACE}" "${POD}" --tail=20 2>/dev/null || echo "  Could not retrieve logs"
+            echo ""
+        fi
+    done
+fi
+
+echo "=== Deployment Complete ==="
+echo ""
+echo "Monitor producers:"
+echo "  kubectl get pods -n ${NAMESPACE} -l app=fluss-producer -o wide"
+echo "  kubectl logs -n ${NAMESPACE} -l app=fluss-producer -f"
+echo ""
+echo "Check pod distribution:"
+echo "  kubectl get pods -n ${NAMESPACE} -l app=fluss-producer -o jsonpath='{range .items[*]}{.metadata.name}{\"\\t\"}{.spec.nodeName}{\"\\n\"}{end}' | sort -k2"
+echo ""
+echo "Delete all producer jobs:"
+echo "  kubectl delete jobs -n ${NAMESPACE} -l app=fluss-producer"
+echo ""
+
diff --git a/e2e-iot/high-infra/k8s/jobs/deploy-producer-optimal.sh b/e2e-iot/high-infra/k8s/jobs/deploy-producer-optimal.sh
new file mode 100755
index 0000000..b9fdab1
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/jobs/deploy-producer-optimal.sh
@@ -0,0 +1,212 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -e
+
+# Deploy producer with optimal performance configuration
+# This script uses the optimal settings discovered through performance testing:
+# - Batch timeout: 90ms (or 50ms) for optimal throughput
+# - Flush interval: 5000 records (balances latency vs throughput)
+# - Rate: 200000 records/sec
+# - Buffer: 2gb, Batch: 128mb
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Optimal performance configuration
+export NAMESPACE="${NAMESPACE:-fluss}"
+export PRODUCER_RATE="${PRODUCER_RATE:-200000}"
+export PRODUCER_FLUSH_EVERY="${PRODUCER_FLUSH_EVERY:-5000}"
+export PRODUCER_STATS_EVERY="${PRODUCER_STATS_EVERY:-50000}"
+export CLIENT_WRITER_BATCH_TIMEOUT="${CLIENT_WRITER_BATCH_TIMEOUT:-90ms}"
+export CLIENT_WRITER_BUFFER_MEMORY_SIZE="${CLIENT_WRITER_BUFFER_MEMORY_SIZE:-2gb}"
+export CLIENT_WRITER_BATCH_SIZE="${CLIENT_WRITER_BATCH_SIZE:-128mb}"
+export PRODUCER_MEMORY_REQUEST="${PRODUCER_MEMORY_REQUEST:-4Gi}"
+export PRODUCER_MEMORY_LIMIT="${PRODUCER_MEMORY_LIMIT:-16Gi}"
+export PRODUCER_CPU_REQUEST="${PRODUCER_CPU_REQUEST:-2000m}"
+export PRODUCER_CPU_LIMIT="${PRODUCER_CPU_LIMIT:-8000m}"
+export BOOTSTRAP="${BOOTSTRAP:-coordinator-server-hs.fluss.svc.cluster.local:9124}"
+export DATABASE="${DATABASE:-iot}"
+export TABLE="${TABLE:-sensor_readings}"
+export BUCKETS="${BUCKETS:-48}"
+export TOTAL_PRODUCERS="${TOTAL_PRODUCERS:-1}"
+export INSTANCE_ID="${INSTANCE_ID:-0}"
+export NUM_WRITER_THREADS="${NUM_WRITER_THREADS:-48}"
+export DEMO_IMAGE_REPO="${DEMO_IMAGE_REPO:-343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss-demo}"
+export DEMO_IMAGE_TAG="${DEMO_IMAGE_TAG:-latest}"
+
+# Parse command line arguments for overrides
+WAIT_FOR_READY=false
+SHOW_LOGS=false
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --namespace)
+            export NAMESPACE="$2"
+            shift 2
+            ;;
+        --rate)
+            export PRODUCER_RATE="$2"
+            shift 2
+            ;;
+        --flush)
+            export PRODUCER_FLUSH_EVERY="$2"
+            shift 2
+            ;;
+        --batch-timeout)
+            export CLIENT_WRITER_BATCH_TIMEOUT="$2"
+            shift 2
+            ;;
+        --buffer-size)
+            export CLIENT_WRITER_BUFFER_MEMORY_SIZE="$2"
+            shift 2
+            ;;
+        --batch-size)
+            export CLIENT_WRITER_BATCH_SIZE="$2"
+            shift 2
+            ;;
+        --wait)
+            WAIT_FOR_READY=true
+            shift
+            ;;
+        --logs)
+            SHOW_LOGS=true
+            shift
+            ;;
+        -h|--help)
+            echo "Usage: $0 [options]"
+            echo ""
+            echo "Deploys producer with optimal performance configuration:"
+            echo "  Rate: 200,000 records/sec"
+            echo "  Flush: every 5,000 records"
+            echo "  Batch timeout: 90ms"
+            echo "  Buffer: 2gb, Batch: 128mb"
+            echo ""
+            echo "Options:"
+            echo "  --namespace NAMESPACE          Kubernetes namespace (default: fluss)"
+            echo "  --rate RATE                    Records per second (default: 200000)"
+            echo "  --flush FLUSH                  Flush every N records (default: 5000)"
+            echo "  --batch-timeout TIMEOUT         Batch timeout (default: 90ms)"
+            echo "  --buffer-size SIZE             Writer buffer memory size (default: 2gb)"
+            echo "  --batch-size SIZE              Writer batch size (default: 128mb)"
+            echo "  --wait                         Wait for job to be ready"
+            echo "  --logs                         Show logs after deployment"
+            echo ""
+            echo "Environment variables can also be used to override defaults."
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+# Check kubectl is available
+if ! command -v kubectl &> /dev/null; then
+    echo "ERROR: kubectl is not installed or not in PATH"
+    exit 1
+fi
+
+# Check if namespace exists
+if ! kubectl get namespace "${NAMESPACE}" &>/dev/null; then
+    echo "ERROR: Namespace ${NAMESPACE} does not exist"
+    exit 1
+fi
+
+echo "=== Deploying Producer with Optimal Configuration ==="
+echo "Namespace: ${NAMESPACE}"
+echo "Image: ${DEMO_IMAGE_REPO}:${DEMO_IMAGE_TAG}"
+echo "Rate: ${PRODUCER_RATE} records/sec"
+echo "Flush: every ${PRODUCER_FLUSH_EVERY} records"
+echo "Batch Timeout: ${CLIENT_WRITER_BATCH_TIMEOUT}"
+echo "Buffer Size: ${CLIENT_WRITER_BUFFER_MEMORY_SIZE}"
+echo "Batch Size: ${CLIENT_WRITER_BATCH_SIZE}"
+echo "Memory: ${PRODUCER_MEMORY_REQUEST} request, ${PRODUCER_MEMORY_LIMIT} limit"
+echo "CPU: ${PRODUCER_CPU_REQUEST} request, ${PRODUCER_CPU_LIMIT} limit"
+echo "Bootstrap: ${BOOTSTRAP}"
+echo "Database: ${DATABASE}"
+echo "Table: ${TABLE}"
+echo "Buckets: ${BUCKETS}"
+echo "Writer Threads: ${NUM_WRITER_THREADS}"
+echo ""
+
+# Always delete existing job before deploying
+echo "[1/4] Deleting existing producer job (if any)..."
+EXISTING_JOB=$(kubectl get job -n "${NAMESPACE}" fluss-producer -o name 2>/dev/null || echo "")
+if [ -n "${EXISTING_JOB}" ]; then
+    kubectl delete job -n "${NAMESPACE}" fluss-producer
+    echo "  ✓ Existing job deleted"
+    sleep 2
+else
+    echo "  ℹ No existing job found"
+fi
+echo ""
+
+# Deploy producer job (envsubst substitutes the exported variables into the YAML)
+echo "[2/4] Deploying producer job..."
+envsubst < "${SCRIPT_DIR}/producer-job.yaml" | kubectl apply -f -
+echo "  ✓ Producer job deployed"
+echo ""
+
+# Wait for job to be ready if requested
+if [ "${WAIT_FOR_READY}" = true ]; then
+    echo "[3/4] Waiting for producer pod to be ready..."
+    # `kubectl wait` errors out at once when no pod matches yet, so first poll (up to ~30s) for the Job's pod
+    for ((tries = 0; tries < 30; tries++)); do
+        kubectl get pods -n "${NAMESPACE}" -l app=fluss-producer -o name 2>/dev/null | grep -q . && break; sleep 1
+    done
+    if kubectl wait --for=condition=ready pod -l app=fluss-producer -n "${NAMESPACE}" --timeout=300s 2>/dev/null; then
+        echo "  ✓ Producer pod is ready"
+    else
+        echo "  ⚠ Timeout waiting for producer pod to be ready"
+        echo "  Check status: kubectl get pods -n ${NAMESPACE} -l app=fluss-producer"
+    fi
+    echo ""
+fi
+
+# Show job status
+echo "[4/4] Producer job status:"
+kubectl get job -n "${NAMESPACE}" fluss-producer
+echo ""
+kubectl get pods -n "${NAMESPACE}" -l app=fluss-producer
+echo ""
+
+# Show logs if requested
+if [ "${SHOW_LOGS}" = true ]; then
+    PRODUCER_POD=$(kubectl get pod -n "${NAMESPACE}" -l app=fluss-producer -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+    if [ -n "${PRODUCER_POD}" ]; then
+        echo "=== Producer Logs (last 50 lines) ==="
+        kubectl logs -n "${NAMESPACE}" "${PRODUCER_POD}" --tail=50 || echo "  Could not retrieve logs"
+        echo ""
+    fi
+fi
+
+echo "=== Deployment Complete ==="
+echo ""
+echo "Monitor producer:"
+echo "  kubectl get pods -n ${NAMESPACE} -l app=fluss-producer"
+echo "  kubectl logs -n ${NAMESPACE} -l app=fluss-producer -f"
+echo ""
+echo "View producer metrics:"
+echo "  kubectl port-forward -n ${NAMESPACE} svc/fluss-producer-metrics 8080:8080"
+echo "  Then open: http://localhost:8080/metrics"
+echo ""
+echo "Delete producer job:"
+echo "  kubectl delete job -n ${NAMESPACE} fluss-producer"
+echo ""
diff --git a/e2e-iot/high-infra/k8s/jobs/deploy-producer.sh b/e2e-iot/high-infra/k8s/jobs/deploy-producer.sh
new file mode 100755
index 0000000..473a4cb
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/jobs/deploy-producer.sh
@@ -0,0 +1,313 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -e  # abort on the first failing command
+
+# Deploy producer job to Kubernetes
+# Usage: ./deploy-producer.sh [options]
+# Options:
+#   --namespace NAMESPACE          Kubernetes namespace (default: fluss)
+#   --image IMAGE                  Docker image (default: from ECR or env)
+#   --rate RATE                    Records per second (default: 200000)
+#   --flush FLUSH                  Flush every N records (default: 5000)
+#   --stats STATS                  Stats every N records (default: 50000)
+#   --buffer-size SIZE             Writer buffer memory size (default: 2gb)
+#   --batch-size SIZE              Writer batch size (default: 128mb)
+#   --memory-request SIZE          Memory request (default: 4Gi)
+#   --memory-limit SIZE            Memory limit (default: 16Gi)
+#   --cpu-request SIZE             CPU request (default: 2000m)
+#   --cpu-limit SIZE               CPU limit (default: 8000m)
+#   --bootstrap BOOTSTRAP          Fluss coordinator address (default: coordinator-server-hs.fluss.svc.cluster.local:9124)
+#   --database DATABASE            Database name (default: iot)
+#   --table TABLE                  Table name (default: sensor_readings)
+#   --buckets BUCKETS              Number of buckets (default: 48)
+#   --wait                         Wait for job to be ready
+#   --logs                         Show logs after deployment
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+: "${NAMESPACE:=fluss}"
+# Tunable defaults: a non-empty value already present in the environment wins, otherwise the fallback is used
+: "${PRODUCER_RATE:=200000}"
+: "${PRODUCER_FLUSH_EVERY:=5000}"
+: "${PRODUCER_STATS_EVERY:=50000}"
+: "${CLIENT_WRITER_BATCH_TIMEOUT:=90ms}"
+: "${CLIENT_WRITER_BUFFER_MEMORY_SIZE:=2gb}"
+: "${CLIENT_WRITER_BATCH_SIZE:=128mb}"
+: "${PRODUCER_MEMORY_REQUEST:=4Gi}"
+: "${PRODUCER_MEMORY_LIMIT:=16Gi}"
+: "${PRODUCER_CPU_REQUEST:=2000m}"
+: "${PRODUCER_CPU_LIMIT:=8000m}"
+: "${BOOTSTRAP:=coordinator-server-hs.fluss.svc.cluster.local:9124}"
+: "${DATABASE:=iot}"
+: "${TABLE:=sensor_readings}"
+: "${BUCKETS:=48}"
+: "${TOTAL_PRODUCERS:=1}"
+: "${INSTANCE_ID:=0}"
+: "${NUM_WRITER_THREADS:=48}"
+: "${DEMO_IMAGE_REPO:=343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss-demo}"
+: "${DEMO_IMAGE_TAG:=latest}"
+WAIT_FOR_READY=false   # switched on by --wait
+SHOW_LOGS=false        # switched on by --logs
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --namespace)
+            NAMESPACE="$2"
+            shift 2
+            ;;
+        --image)  # REPO[:TAG]; a ':' before the last '/' is a registry port, not a tag separator
+            if [[ "${2##*/}" == *":"* ]]; then
+                DEMO_IMAGE_REPO="${2%:*}"
+                DEMO_IMAGE_TAG="${2##*:}"
+            else
+                DEMO_IMAGE_REPO="$2"  # no tag given: DEMO_IMAGE_TAG keeps its current value
+            fi
+            shift 2
+            ;;
+        --rate)
+            PRODUCER_RATE="$2"
+            shift 2
+            ;;
+        --flush)
+            PRODUCER_FLUSH_EVERY="$2"
+            shift 2
+            ;;
+        --stats)
+            PRODUCER_STATS_EVERY="$2"
+            shift 2
+            ;;
+        --buffer-size)
+            CLIENT_WRITER_BUFFER_MEMORY_SIZE="$2"
+            shift 2
+            ;;
+        --batch-size)
+            CLIENT_WRITER_BATCH_SIZE="$2"
+            shift 2
+            ;;
+        --memory-request)
+            PRODUCER_MEMORY_REQUEST="$2"
+            shift 2
+            ;;
+        --memory-limit)
+            PRODUCER_MEMORY_LIMIT="$2"
+            shift 2
+            ;;
+        --cpu-request)
+            PRODUCER_CPU_REQUEST="$2"
+            shift 2
+            ;;
+        --cpu-limit)
+            PRODUCER_CPU_LIMIT="$2"
+            shift 2
+            ;;
+        --bootstrap)
+            BOOTSTRAP="$2"
+            shift 2
+            ;;
+        --database)
+            DATABASE="$2"
+            shift 2
+            ;;
+        --table)
+            TABLE="$2"
+            shift 2
+            ;;
+        --buckets)
+            BUCKETS="$2"
+            shift 2
+            ;;
+        --total-producers)
+            TOTAL_PRODUCERS="$2"
+            shift 2
+            ;;
+        --instance-id)
+            INSTANCE_ID="$2"
+            shift 2
+            ;;
+        --writer-threads)
+            NUM_WRITER_THREADS="$2"
+            shift 2
+            ;;
+        --wait)
+            WAIT_FOR_READY=true
+            shift
+            ;;
+        --logs)
+            SHOW_LOGS=true
+            shift
+            ;;
+        -h|--help)  # defaults printed here mirror the fallback values assigned near the top of the script
+            echo "Usage: $0 [options]"
+            echo ""
+            echo "Options:"
+            echo "  --namespace NAMESPACE          Kubernetes namespace (default: fluss)"
+            echo "  --image IMAGE                  Docker image (default: from ECR or env)"
+            echo "  --rate RATE                    Records per second (default: 200000)"
+            echo "  --flush FLUSH                  Flush every N records (default: 5000)"
+            echo "  --stats STATS                  Stats every N records (default: 50000)"
+            echo "  --buffer-size SIZE             Writer buffer memory size (default: 2gb)"
+            echo "  --batch-size SIZE              Writer batch size (default: 128mb)"
+            echo "  --memory-request SIZE          Memory request (default: 4Gi)"
+            echo "  --memory-limit SIZE            Memory limit (default: 16Gi)"
+            echo "  --cpu-request SIZE             CPU request (default: 2000m)"
+            echo "  --cpu-limit SIZE               CPU limit (default: 8000m)"
+            echo "  --bootstrap BOOTSTRAP          Fluss coordinator address"
+            echo "  --database DATABASE            Database name (default: iot)"
+            echo "  --table TABLE                  Table name (default: sensor_readings)"
+            echo "  --buckets BUCKETS              Number of buckets (default: 48)"
+            echo "  --total-producers COUNT        Total producer instances (default: 1)"
+            echo "  --instance-id ID               Instance ID (0-based, default: 0)"
+            echo "  --writer-threads THREADS       Number of writer threads (default: 48)"
+            echo "  --wait                         Wait for job to be ready"
+            echo "  --logs                         Show logs after deployment"
+            echo ""
+            echo "Environment variables:"
+            echo "  NAMESPACE, PRODUCER_RATE, PRODUCER_FLUSH_EVERY, PRODUCER_STATS_EVERY"
+            echo "  CLIENT_WRITER_BUFFER_MEMORY_SIZE, CLIENT_WRITER_BATCH_SIZE, CLIENT_WRITER_BATCH_TIMEOUT"
+            echo "  PRODUCER_MEMORY_REQUEST, PRODUCER_MEMORY_LIMIT"
+            echo "  PRODUCER_CPU_REQUEST, PRODUCER_CPU_LIMIT"
+            echo "  BOOTSTRAP, DATABASE, TABLE, BUCKETS"
+            echo "  TOTAL_PRODUCERS, INSTANCE_ID, NUM_WRITER_THREADS"
+            echo "  DEMO_IMAGE_REPO, DEMO_IMAGE_TAG"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+# Check kubectl is available
+if ! command -v kubectl &> /dev/null; then
+    echo "ERROR: kubectl is not installed or not in PATH"
+    exit 1
+fi
+
+# Check if namespace exists
+if ! kubectl get namespace "${NAMESPACE}" &>/dev/null; then
+    echo "ERROR: Namespace ${NAMESPACE} does not exist"
+    exit 1
+fi
+
+echo "=== Deploying Producer Job ==="
+echo "Namespace: ${NAMESPACE}"
+echo "Image: ${DEMO_IMAGE_REPO}:${DEMO_IMAGE_TAG}"
+echo "Rate: ${PRODUCER_RATE} records/sec"
+echo "Flush: every ${PRODUCER_FLUSH_EVERY} records"
+echo "Stats: every ${PRODUCER_STATS_EVERY} records"
+echo "Buffer Size: ${CLIENT_WRITER_BUFFER_MEMORY_SIZE}"
+echo "Batch Size: ${CLIENT_WRITER_BATCH_SIZE}"
+echo "Memory: ${PRODUCER_MEMORY_REQUEST} request, ${PRODUCER_MEMORY_LIMIT} limit"
+echo "CPU: ${PRODUCER_CPU_REQUEST} request, ${PRODUCER_CPU_LIMIT} limit"
+echo "Bootstrap: ${BOOTSTRAP}"
+echo "Database: ${DATABASE}"
+echo "Table: ${TABLE}"
+echo "Buckets: ${BUCKETS}"
+echo "Total Producers: ${TOTAL_PRODUCERS}"
+echo "Instance ID: ${INSTANCE_ID}"
+echo "Writer Threads: ${NUM_WRITER_THREADS}"
+echo ""
+
+# Always delete existing job before deploying
+echo "[1/4] Deleting existing producer job (if any)..."
+EXISTING_JOB=$(kubectl get job -n "${NAMESPACE}" fluss-producer -o name 2>/dev/null || echo "")
+if [ -n "${EXISTING_JOB}" ]; then
+    kubectl delete job -n "${NAMESPACE}" fluss-producer
+    echo "  ✓ Existing job deleted"
+    # Wait a moment for the job and pods to be fully deleted
+    sleep 2
+else
+    echo "  ℹ No existing job found"
+fi
+echo ""
+
+# Deploy producer job
+echo "[2/4] Deploying producer job..."
+export NAMESPACE
+export DEMO_IMAGE_REPO
+export DEMO_IMAGE_TAG
+export PRODUCER_RATE
+export PRODUCER_FLUSH_EVERY
+export PRODUCER_STATS_EVERY
+export CLIENT_WRITER_BUFFER_MEMORY_SIZE
+export CLIENT_WRITER_BATCH_SIZE
+export CLIENT_WRITER_BATCH_TIMEOUT
+export PRODUCER_MEMORY_REQUEST
+export PRODUCER_MEMORY_LIMIT
+export PRODUCER_CPU_REQUEST
+export PRODUCER_CPU_LIMIT
+export BOOTSTRAP
+export DATABASE
+export TABLE
+export BUCKETS
+export TOTAL_PRODUCERS
+export INSTANCE_ID
+export NUM_WRITER_THREADS
+
+# Use envsubst to substitute variables in the YAML
+envsubst < "${SCRIPT_DIR}/producer-job.yaml" | kubectl apply -f -
+
+echo "  ✓ Producer job deployed"
+echo ""
+
+# Wait for job to be ready if requested
+if [ "${WAIT_FOR_READY}" = true ]; then
+    echo "[3/4] Waiting for producer pod to be ready..."
+    if kubectl wait --for=condition=ready pod -l app=fluss-producer -n "${NAMESPACE}" --timeout=300s 2>/dev/null; then
+        echo "  ✓ Producer pod is ready"
+    else
+        echo "  ⚠ Timeout waiting for producer pod to be ready"
+        echo "  Check status: kubectl get pods -n ${NAMESPACE} -l app=fluss-producer"
+    fi
+    echo ""
+fi
+
+# Show job status
+echo "[4/4] Producer job status:"
+kubectl get job -n "${NAMESPACE}" fluss-producer
+echo ""
+kubectl get pods -n "${NAMESPACE}" -l app=fluss-producer
+echo ""
+
+# Show logs if requested
+if [ "${SHOW_LOGS}" = true ]; then
+    PRODUCER_POD=$(kubectl get pod -n "${NAMESPACE}" -l app=fluss-producer -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+    if [ -n "${PRODUCER_POD}" ]; then
+        echo "=== Producer Logs (last 50 lines) ==="
+        kubectl logs -n "${NAMESPACE}" "${PRODUCER_POD}" --tail=50 || echo "  Could not retrieve logs"
+        echo ""
+    fi
+fi
+
+echo "=== Deployment Complete ==="
+echo ""
+echo "Monitor producer:"
+echo "  kubectl get pods -n ${NAMESPACE} -l app=fluss-producer"
+echo "  kubectl logs -n ${NAMESPACE} -l app=fluss-producer -f"
+echo ""
+echo "View producer metrics:"
+echo "  kubectl port-forward -n ${NAMESPACE} svc/fluss-producer-metrics 8080:8080"
+echo "  Then open: http://localhost:8080/metrics"
+echo ""
+echo "Delete producer job:"
+echo "  kubectl delete job -n ${NAMESPACE} fluss-producer"
+echo ""
+
diff --git a/e2e-iot/high-infra/k8s/jobs/flink-aggregator-job.yaml b/e2e-iot/high-infra/k8s/jobs/flink-aggregator-job.yaml
new file mode 100644
index 0000000..925a961
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/jobs/flink-aggregator-job.yaml
@@ -0,0 +1,25 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# NOTE: This file is deprecated - use flink-job-submission-simple.yaml instead
+# The Flink aggregator should be submitted as a job to the Flink cluster,
+# not run as a standalone application.
+# 
+# To submit the job, use:
+# kubectl apply -f flink/flink-job-submission-simple.yaml
+# (after setting DEMO_IMAGE_REPOSITORY and DEMO_IMAGE_TAG environment variables)
+
diff --git a/e2e-iot/high-infra/k8s/jobs/producer-job.yaml b/e2e-iot/high-infra/k8s/jobs/producer-job.yaml
new file mode 100644
index 0000000..4fb7612
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/jobs/producer-job.yaml
@@ -0,0 +1,150 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: fluss-producer
+  namespace: ${NAMESPACE}
+  labels:
+    app: fluss-producer
+spec:
+  backoffLimit: 0
+  completions: 1
+  parallelism: 1
+  ttlSecondsAfterFinished: 86400
+  template:
+    metadata:
+      labels:
+        app: fluss-producer
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "8080"
+        prometheus.io/path: "/metrics"
+    spec:
+      restartPolicy: Never
+      nodeSelector:
+        node-type: producer
+      tolerations:
+        - key: producer-component
+          operator: Equal
+          value: producer
+          effect: NoSchedule
+      topologySpreadConstraints:
+        - maxSkew: 1
+          topologyKey: kubernetes.io/hostname
+          whenUnsatisfiable: DoNotSchedule
+          labelSelector:
+            matchLabels:
+              app: fluss-producer
+      initContainers:
+        - name: wait-for-fluss
+          image: busybox:1.36
+          command:
+            - sh
+            - -c
+            - |
+              echo "Waiting for Fluss coordinator to be ready..."
+              addr="${BOOTSTRAP}"; addr="${addr%%,*}"  # BOOTSTRAP is rendered by envsubst; probe the first host:port entry in case a list is given
+              until nc -zv "${addr%:*}" "${addr##*:}" 2>&1 | grep -q "open"; do  # nc also verifies DNS resolution
+                echo "Waiting for Fluss coordinator on ${BOOTSTRAP}..."
+                sleep 2
+              done
+              echo "Fluss coordinator is ready!"
+      containers:
+        - name: producer
+          image: ${DEMO_IMAGE_REPO}:${DEMO_IMAGE_TAG}
+          imagePullPolicy: Always
+          securityContext:
+            runAsUser: 0
+          env:
+            - name: BOOTSTRAP
+              value: "${BOOTSTRAP}"
+            - name: DATABASE
+              value: "${DATABASE}"
+            - name: TABLE
+              value: "${TABLE}"
+            - name: BUCKETS
+              value: "${BUCKETS}"
+            - name: PRODUCER_RATE
+              value: "${PRODUCER_RATE}"
+            - name: PRODUCER_FLUSH_EVERY
+              value: "${PRODUCER_FLUSH_EVERY}"
+            - name: PRODUCER_STATS_EVERY
+              value: "${PRODUCER_STATS_EVERY}"
+            - name: CLIENT_WRITER_BUFFER_MEMORY_SIZE
+              value: "${CLIENT_WRITER_BUFFER_MEMORY_SIZE}"
+            - name: CLIENT_WRITER_BATCH_SIZE
+              value: "${CLIENT_WRITER_BATCH_SIZE}"
+            - name: CLIENT_WRITER_BATCH_TIMEOUT
+              value: "${CLIENT_WRITER_BATCH_TIMEOUT}"
+            - name: TOTAL_PRODUCERS
+              value: "${TOTAL_PRODUCERS}"
+            - name: INSTANCE_ID
+              value: "${INSTANCE_ID}"
+            - name: NUM_WRITER_THREADS
+              value: "${NUM_WRITER_THREADS}"
+          command:
+            - /bin/sh
+            - -c
+          args:
+            - |
+              java --add-opens=java.base/java.util=ALL-UNNAMED \
+                   --add-opens=java.base/java.lang=ALL-UNNAMED \
+                   --add-opens=java.base/java.nio=ALL-UNNAMED \
+                   --add-opens=java.base/java.time=ALL-UNNAMED \
+                   -cp /opt/flink/usrlib/fluss-flink-realtime-demo.jar \
+                   org.apache.fluss.benchmark.e2eplatformaws.producer.FlussSensorProducerAppMultiInstance \
+                   --bootstrap "$BOOTSTRAP" \
+                   --database "$DATABASE" \
+                   --table "$TABLE" \
+                   --buckets "$BUCKETS" \
+                   --total-producers "$TOTAL_PRODUCERS" \
+                   --instance-id "$INSTANCE_ID" \
+                   --rate "$PRODUCER_RATE" \
+                   --writer-threads "$NUM_WRITER_THREADS" \
+                   --flush "$PRODUCER_FLUSH_EVERY" \
+                   --stats "$PRODUCER_STATS_EVERY"
+          ports:
+            - name: metrics
+              containerPort: 8080
+              protocol: TCP
+          resources:
+            requests:
+              memory: "${PRODUCER_MEMORY_REQUEST}"
+              cpu: "${PRODUCER_CPU_REQUEST}"
+            limits:
+              memory: "${PRODUCER_MEMORY_LIMIT}"
+              cpu: "${PRODUCER_CPU_LIMIT}"
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: fluss-producer-metrics
+  namespace: ${NAMESPACE}
+  labels:
+    app: fluss-producer
+spec:
+  selector:
+    app: fluss-producer
+  ports:
+    - name: metrics
+      port: 8080
+      targetPort: 8080
+      protocol: TCP
+  type: ClusterIP
+
diff --git a/e2e-iot/high-infra/k8s/monitor-flink-logs.sh b/e2e-iot/high-infra/k8s/monitor-flink-logs.sh
new file mode 100755
index 0000000..c186ee5
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/monitor-flink-logs.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Script to monitor Flink job logs for errors and failures
+# Usage: ./monitor-flink-logs.sh [job-id]
+
+set -euo pipefail
+
+NAMESPACE="${NAMESPACE:-fluss}"
+JOB_ID="${1:-06fa48f0f071363871180341f4c447e5}"
+
+echo "=== Flink Job Log Monitor ==="
+echo "Job ID: ${JOB_ID}"
+echo "Namespace: ${NAMESPACE}"
+echo ""
+echo "Monitoring logs for errors... (Press Ctrl+C to stop)"
+echo ""
+
+# Print the job state reported by the JobManager REST API; returns 1 unless the state is RUNNING or CREATED
+check_job_status() {
+    JOBMANAGER_POD=$(kubectl get pod -n "${NAMESPACE}" -l app=flink,component=jobmanager -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+    [ -n "${JOBMANAGER_POD}" ] || return 0  # no JobManager pod found: nothing to query
+    # Falls back to UNKNOWN when the kubectl/curl/python3 pipeline fails or the response has no "state" field
+    STATE=$(kubectl exec -n "${NAMESPACE}" "${JOBMANAGER_POD}" -- curl -s "http://localhost:8081/jobs/${JOB_ID}" 2>/dev/null | python3 -c "import sys, json; print(json.load(sys.stdin).get('state', 'UNKNOWN'))" 2>/dev/null || echo "UNKNOWN")
+    echo "[$(date +'%Y-%m-%d %H:%M:%S')] Job State: ${STATE}"
+    if [ "${STATE}" != "RUNNING" ] && [ "${STATE}" != "CREATED" ]; then
+        echo "⚠️  WARNING: Job is not in RUNNING state!"
+        return 1
+    fi
+    return 0
+}
+
+# Monitor TaskManager logs for errors
+monitor_taskmanager_logs() {
+    echo "=== TaskManager Logs (following for errors) ==="
+    kubectl logs -n ${NAMESPACE} -l app=flink,component=taskmanager -f --tail=0 2>&1 | while IFS= read -r line; do
+        if echo "$line" | grep -qiE "(error|exception|failed|fail|outofrange|timeout)"; then
+            echo "[$(date +'%Y-%m-%d %H:%M:%S')] ⚠️  ERROR DETECTED: $line"
+        fi
+    done
+}
+
+# Monitor JobManager logs for errors
+monitor_jobmanager_logs() {
+    echo "=== JobManager Logs (following for errors) ==="
+    kubectl logs -n ${NAMESPACE} -l app=flink,component=jobmanager -f --tail=0 2>&1 | while IFS= read -r line; do
+        if echo "$line" | grep -qiE "(error|exception|failed|fail|timeout)"; then
+            echo "[$(date +'%Y-%m-%d %H:%M:%S')] ⚠️  ERROR DETECTED: $line"
+        fi
+    done
+}
+
+# Check job status every 30 seconds in the background
+(
+    while true; do
+        check_job_status || true  # a non-RUNNING state is only reported; without this, set -e ends the loop
+        sleep 30
+    done
+) &
+STATUS_PID=$!
+
+# On Ctrl+C/TERM signal the script's whole process group so the kubectl log followers stop too
+trap 'trap "" TERM; kill 0; exit' INT TERM
+
+# Monitor both TaskManager and JobManager logs
+monitor_taskmanager_logs &
+TM_PID=$!
+
+monitor_jobmanager_logs &
+JM_PID=$!
+
+# Block until all background monitors have exited (in practice: until interrupted)
+wait "${TM_PID}" "${JM_PID}" "${STATUS_PID}"
+
+
diff --git a/e2e-iot/high-infra/k8s/monitoring/FLUSS_METRICS_SETUP.md b/e2e-iot/high-infra/k8s/monitoring/FLUSS_METRICS_SETUP.md
new file mode 100644
index 0000000..9eb3da6
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/monitoring/FLUSS_METRICS_SETUP.md
@@ -0,0 +1,203 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Fluss Coordinator & Tablet Server Metrics Setup
+
+## Overview
+
+This document explains how to populate metrics for Fluss Coordinator and Tablet Servers in the Grafana dashboard.
+
+## Current Configuration
+
+### ✅ ServiceMonitors Configured
+- **Coordinator**: `servicemonitors.yaml` includes `fluss-coordinator-metrics`
+- **Tablet Servers**: `servicemonitors.yaml` includes `fluss-tablet-metrics`
+- Both scrape port `9249`, path `/metrics`, interval `30s`
+
+### ✅ Dashboard Panels Added
+The dashboard now includes panels for:
+
+**Coordinator Metrics:**
+- Request Rate (requests/sec)
+- Error Rate (errors/sec)
+- Active Tablet Server Count
+- Table Count
+- Bucket Count
+- Request Latency (p95)
+
+**Tablet Server Metrics:**
+- Messages In Rate (messages/sec)
+- Bytes In/Out Rate (bytes/sec)
+- Replication Rates (bytes/sec)
+- Leader/Replica Counts
+
+## How to Verify Metrics Are Being Scraped
+
+### Step 1: Check ServiceMonitors
+```bash
+kubectl get servicemonitor -n fluss
+kubectl describe servicemonitor -n fluss fluss-coordinator-metrics
+kubectl describe servicemonitor -n fluss fluss-tablet-metrics
+```
+
+### Step 2: Check Prometheus Targets
+1. Port-forward Prometheus:
+   ```bash
+   kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090
+   ```
+2. Open http://localhost:9090
+3. Go to **Status > Targets**
+4. Look for targets with labels:
+   - `job="fluss/fluss-coordinator-metrics"`
+   - `job="fluss/fluss-tablet-metrics"`
+5. Verify they show as **UP** (green)
+
+### Step 3: Query Metrics in Prometheus
+In Prometheus UI, try these queries to find exact metric names:
+
+**Find all Fluss coordinator metrics:**
+```promql
+{job=~"fluss.*coordinator.*"}
+```
+
+**Find all Fluss tablet metrics:**
+```promql
+{job=~"fluss.*tablet.*"}
+```
+
+**List all available metrics:**
+```promql
+{__name__=~"fluss.*"}
+```
+
+## Finding Exact Metric Names
+
+Fluss metrics may be exposed with different naming conventions. Common patterns:
+
+1. **CamelCase**: `fluss_coordinator_requestsPerSecond`
+2. **Snake_case**: `fluss_coordinator_requests_per_second`
+3. **With component prefix**: `fluss_coordinator_server_requests_per_second`
+
+The dashboard queries include fallbacks for common patterns, but you may need to adjust based on actual metric names.
+
+### How to Check Actual Metric Names
+
+1. **Directly query Fluss metrics endpoint:**
+   ```bash
+   # Get coordinator pod
+   COORD_POD=$(kubectl get pod -n fluss -l app.kubernetes.io/component=coordinator -o jsonpath='{.items[0].metadata.name}')
+   
+   # Port-forward metrics
+   kubectl port-forward -n fluss $COORD_POD 9249:9249
+   
+   # Query metrics
+   curl http://localhost:9249/metrics | grep fluss
+   ```
+
+2. **Check Prometheus metrics explorer:**
+   - In Prometheus UI, go to **Graph**
+   - Type `fluss` and use autocomplete to see available metrics
+
+## Updating Dashboard Queries
+
+If metrics don't appear, update the queries in the dashboard:
+
+1. **Edit the dashboard JSON:**
+   ```bash
+   vim e2e-iot/high-infra/k8s/monitoring/fluss-flink-dashboard.json
+   ```
+
+2. **Find the panel** (e.g., "Fluss Coordinator - Request Rate")
+
+3. **Update the `expr` field** with the correct metric name from Prometheus
+
+4. **Redeploy dashboard:**
+   ```bash
+   cd e2e-iot/high-infra/k8s/monitoring
+   ./deploy-dashboard.sh
+   ```
+
+## Common Issues
+
+### Issue: Metrics Not Appearing
+**Solution:**
+1. Verify ServiceMonitor selector matches Service labels:
+   ```bash
+   kubectl get svc -n fluss -l app.kubernetes.io/component=coordinator
+   kubectl get svc -n fluss -l app.kubernetes.io/component=tablet-server
+   ```
+
+2. Check if services have `metrics` port defined:
+   ```bash
+   kubectl get svc -n fluss coordinator-server-hs -o yaml | grep -A 5 ports
+   ```
+
+3. Verify pods expose metrics on port 9249:
+   ```bash
+   kubectl get pod -n fluss coordinator-server-0 -o yaml | grep -A 10 ports
+   ```
+
+### Issue: Wrong Metric Names
+**Solution:**
+1. Query Prometheus directly to find exact names
+2. Update dashboard JSON with correct names
+3. Redeploy dashboard
+
+### Issue: ServiceMonitor Not Picking Up Services
+**Solution:**
+1. Check ServiceMonitor namespace matches Service namespace
+2. Verify label selectors match exactly
+3. Check Prometheus operator is running:
+   ```bash
+   kubectl get pods -n monitoring | grep prometheus-operator
+   ```
+
+## Expected Metrics
+
+Based on Fluss source code, these metrics should be available:
+
+### Coordinator Metrics:
+- `requestsPerSecond` / `requests_per_second`
+- `errorsPerSecond` / `errors_per_second`
+- `activeTabletServerCount` / `active_tablet_server_count`
+- `tableCount` / `table_count`
+- `bucketCount` / `bucket_count`
+- `totalTimeMs` / `total_time_ms` (histogram for latency)
+
+### Tablet Server Metrics:
+- `messagesInPerSecond` / `messages_in_per_second`
+- `bytesInPerSecond` / `bytes_in_per_second`
+- `bytesOutPerSecond` / `bytes_out_per_second`
+- `replicationBytesInPerSecond` / `replication_bytes_in_per_second`
+- `replicationBytesOutPerSecond` / `replication_bytes_out_per_second`
+- `leaderCount` / `leader_count`
+- `replicaCount` / `replica_count`
+
+## Next Steps
+
+1. Deploy the updated dashboard:
+   ```bash
+   cd e2e-iot/high-infra/k8s/monitoring
+   ./deploy-dashboard.sh
+   ```
+
+2. Verify metrics appear in Grafana
+
+3. If metrics don't appear, check Prometheus targets and update metric names as needed
+
+4. Adjust queries based on actual metric names exposed by Fluss
diff --git a/e2e-iot/high-infra/k8s/monitoring/deploy-dashboard.sh b/e2e-iot/high-infra/k8s/monitoring/deploy-dashboard.sh
new file mode 100755
index 0000000..54d3449
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/monitoring/deploy-dashboard.sh
@@ -0,0 +1,139 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -e
+
+# Deploy Grafana dashboard via ConfigMap
+# Usage: ./deploy-dashboard.sh
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DASHBOARD_YAML="${SCRIPT_DIR}/grafana-dashboard.yaml"
+DASHBOARD_JSON="${SCRIPT_DIR}/fluss-flink-dashboard.json"
+NAMESPACE="${NAMESPACE:-monitoring}"
+
+echo "=== Deploying Grafana Dashboard ==="
+echo ""
+
+# Check kubectl is available
+if ! command -v kubectl &> /dev/null; then
+    echo "ERROR: kubectl is not installed or not in PATH"
+    exit 1
+fi
+
+# Check if namespace exists
+if ! kubectl get namespace "${NAMESPACE}" &>/dev/null; then
+    echo "ERROR: Namespace ${NAMESPACE} does not exist"
+    exit 1
+fi
+
+# Try to use YAML file first, then fall back to building the ConfigMap from the JSON file
+if [ -f "${DASHBOARD_YAML}" ]; then
+    echo "[1/3] Applying dashboard ConfigMap from YAML..."
+    # Test kubectl directly: with "set -e" a failing command would abort the script
+    # before a separate "$?" check could report the failure.
+    if kubectl apply -f "${DASHBOARD_YAML}"; then
+        echo "  ✓ Dashboard ConfigMap deployed successfully from YAML!"
+    else
+        echo "  ⚠ Failed to deploy dashboard ConfigMap from YAML"
+        exit 1
+    fi
+elif [ -f "${DASHBOARD_JSON}" ]; then
+    echo "[1/3] Creating ConfigMap from JSON file..."
+    # Render the ConfigMap client-side, add the grafana_dashboard=1 label, then apply the result.
+    # Only the exit status of the final "kubectl apply" decides the branch (pipefail is not enabled).
+    if kubectl create configmap fluss-flink-dashboard \
+        --from-file=fluss-flink-dashboard.json="${DASHBOARD_JSON}" \
+        -n "${NAMESPACE}" \
+        --dry-run=client -o yaml | \
+        kubectl label --local -f - grafana_dashboard=1 -o yaml | \
+        kubectl apply -f -; then
+        echo "  ✓ Dashboard ConfigMap deployed successfully from JSON!"
+    else
+        echo "  ⚠ Failed to deploy dashboard ConfigMap from JSON"
+        exit 1
+    fi
+else
+    echo "ERROR: Neither dashboard YAML nor JSON file found"
+    echo "  Expected: ${DASHBOARD_YAML}"
+    echo "  Or: ${DASHBOARD_JSON}"
+    exit 1
+fi
+
+echo ""
+echo "[2/3] Verifying dashboard ConfigMap..."
+sleep 2  # Wait a moment for ConfigMap to be fully available
+if kubectl get configmap -n "${NAMESPACE}" fluss-flink-dashboard &>/dev/null; then
+    kubectl get configmap -n "${NAMESPACE}" fluss-flink-dashboard
+    echo "  ✓ ConfigMap verified"
+else
+    echo "  ⚠ ConfigMap not found, but deployment may have succeeded"
+    echo "  Checking all ConfigMaps in namespace ${NAMESPACE}:"
+    kubectl get configmap -n "${NAMESPACE}" | grep -i dashboard || echo "  No dashboard ConfigMaps found"
+fi
+
+echo ""
+echo "[3/3] Importing dashboard via Grafana API..."
+GRAFANA_USER="${GRAFANA_USER:-admin}"
+GRAFANA_PASS="${GRAFANA_PASS:-admin123}"
+GRAFANA_POD=$(kubectl get pod -n "${NAMESPACE}" -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+
+if [ -z "${GRAFANA_POD}" ]; then
+    echo "  ⚠ Grafana pod not found, skipping API import"
+    echo "  You can import manually via Grafana UI"
+else
+    # Extract dashboard JSON from ConfigMap
+    DASHBOARD_JSON_CONTENT=$(kubectl get configmap -n "${NAMESPACE}" fluss-flink-dashboard -o jsonpath='{.data.fluss-flink-dashboard\.json}')
+    
+    if [ -z "${DASHBOARD_JSON_CONTENT}" ]; then
+        echo "  ⚠ Could not extract dashboard JSON from ConfigMap"
+        echo "  You can import manually via Grafana UI"
+    else
+        # Prepare dashboard payload (ensure overwrite is set)
+        DASHBOARD_PAYLOAD=$(echo "${DASHBOARD_JSON_CONTENT}" | jq '. + {overwrite: true}' 2>/dev/null || echo "${DASHBOARD_JSON_CONTENT}")
+        
+        # Import via Grafana API
+        IMPORT_RESPONSE=$(kubectl exec -n "${NAMESPACE}" "${GRAFANA_POD}" -c grafana -- curl -s -X POST \
+            "http://localhost:3000/api/dashboards/db" \
+            -H "Content-Type: application/json" \
+            -u "${GRAFANA_USER}:${GRAFANA_PASS}" \
+            -d "${DASHBOARD_PAYLOAD}" 2>/dev/null || echo "")
+        
+        if echo "${IMPORT_RESPONSE}" | grep -q '"status":"success"'; then
+            DASHBOARD_UID=$(echo "${IMPORT_RESPONSE}" | jq -r '.uid // empty' 2>/dev/null || echo "")
+            DASHBOARD_URL=$(echo "${IMPORT_RESPONSE}" | jq -r '.url // empty' 2>/dev/null || echo "")
+            echo "  ✓ Dashboard imported successfully via Grafana API!"
+            if [ -n "${DASHBOARD_UID}" ]; then
+                echo "  Dashboard UID: ${DASHBOARD_UID}"
+            fi
+        else
+            ERROR_MSG=$(echo "${IMPORT_RESPONSE}" | jq -r '.message // .error // "Unknown error"' 2>/dev/null || echo "Unknown error")
+            echo "  ⚠ Dashboard import via API failed: ${ERROR_MSG}"
+            echo "  You can import manually via Grafana UI"
+        fi
+    fi
+fi
+
+echo ""
+echo "=== Deployment Complete ==="
+echo ""
+echo "Access Grafana:"
+echo "  kubectl port-forward -n ${NAMESPACE} svc/prometheus-grafana 3000:80"
+echo "  Then open: http://localhost:3000"
+echo "  Username: ${GRAFANA_USER}, Password: ${GRAFANA_PASS}"
+echo ""
+
diff --git a/e2e-iot/high-infra/k8s/monitoring/fluss-complete-dashboard.json b/e2e-iot/high-infra/k8s/monitoring/fluss-complete-dashboard.json
new file mode 100644
index 0000000..2a100ff
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/monitoring/fluss-complete-dashboard.json
@@ -0,0 +1,339 @@
+{
+  "dashboard": {
+    "title": "Fluss & Flink Complete Monitoring",
+    "tags": ["fluss", "flink", "producer", "metrics"],
+    "timezone": "browser",
+    "schemaVersion": 38,
+    "version": 0,
+    "refresh": "5s",
+    "time": {
+      "from": "now-15m",
+      "to": "now"
+    },
+    "panels": [
+      {
+        "id": 1,
+        "title": "Producer - Records Per Second",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "pointSize": 5
+            },
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(max(fluss_producer_records_per_second) by (pod))",
+            "legendFormat": "Total Records/sec",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 2,
+        "title": "Producer - Total Records",
+        "type": "stat",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(fluss_producer_records_total)",
+            "legendFormat": "Total Records",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 3,
+        "title": "Flink - Input Records Per Second",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10
+            },
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(flink_taskmanager_job_task_operator_numRecordsInPerSecond) by (operator_name)",
+            "legendFormat": "{{operator_name}}",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 4,
+        "title": "Flink - Output Records Per Second",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10
+            },
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(flink_taskmanager_job_task_operator_numRecordsOutPerSecond) by (operator_name)",
+            "legendFormat": "{{operator_name}}",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 5,
+        "title": "Flink - Total Input Records",
+        "type": "stat",
+        "gridPos": {"h": 4, "w": 6, "x": 0, "y": 16},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(flink_taskmanager_job_task_operator_numRecordsIn)",
+            "legendFormat": "Total Input",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 6,
+        "title": "Flink - Total Output Records",
+        "type": "stat",
+        "gridPos": {"h": 4, "w": 6, "x": 6, "y": 16},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(flink_taskmanager_job_task_operator_numRecordsOut)",
+            "legendFormat": "Total Output",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 7,
+        "title": "Flink - Custom Aggregator Records In",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 20},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10
+            },
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "flink_taskmanager_job_task_operator_fluss_aggregator_records_in",
+            "legendFormat": "Custom Input Counter",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 8,
+        "title": "Flink - Custom Aggregator Records Out",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 20},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10
+            },
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "flink_taskmanager_job_task_operator_fluss_aggregator_records_out",
+            "legendFormat": "Custom Output Counter",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 9,
+        "title": "Flink - Event Time Lag (ms)",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 28},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10
+            },
+            "unit": "ms"
+          }
+        },
+        "targets": [
+          {
+            "expr": "flink_taskmanager_job_task_operator_fluss_aggregator_event_time_lag_ms",
+            "legendFormat": "Custom Event Time Lag",
+            "refId": "A"
+          },
+          {
+            "expr": "flink_taskmanager_job_task_operator_currentFetchEventTimeLag",
+            "legendFormat": "Flink Event Time Lag",
+            "refId": "B"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 10,
+        "title": "Flink - Backpressure",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 28},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10
+            },
+            "unit": "ms"
+          }
+        },
+        "targets": [
+          {
+            "expr": "flink_taskmanager_job_task_backPressuredTimeMsPerSecond",
+            "legendFormat": "Backpressure",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 11,
+        "title": "Flink Cluster - Running Jobs",
+        "type": "stat",
+        "gridPos": {"h": 4, "w": 6, "x": 0, "y": 36},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "flink_jobmanager_numRunningJobs",
+            "legendFormat": "Running Jobs",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 12,
+        "title": "Flink Cluster - TaskManagers",
+        "type": "stat",
+        "gridPos": {"h": 4, "w": 6, "x": 6, "y": 36},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "flink_jobmanager_numRegisteredTaskManagers",
+            "legendFormat": "Task Managers",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 13,
+        "title": "Producer - Uptime",
+        "type": "stat",
+        "gridPos": {"h": 4, "w": 6, "x": 12, "y": 36},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "s"
+          }
+        },
+        "targets": [
+          {
+            "expr": "max(fluss_producer_uptime_seconds)",
+            "legendFormat": "Max Uptime",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 14,
+        "title": "Producer - Rate (5m avg)",
+        "type": "stat",
+        "gridPos": {"h": 4, "w": 6, "x": 18, "y": 36},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(max(rate(fluss_producer_records_total[5m])) by (pod))",
+            "legendFormat": "Records/sec (5m avg)",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      }
+    ]
+  },
+  "overwrite": true
+}
+
diff --git a/e2e-iot/high-infra/k8s/monitoring/fluss-flink-dashboard.json b/e2e-iot/high-infra/k8s/monitoring/fluss-flink-dashboard.json
new file mode 100644
index 0000000..fd43aef
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/monitoring/fluss-flink-dashboard.json
@@ -0,0 +1,982 @@
+{
+  "dashboard": {
+    "title": "Fluss & Flink Monitoring Dashboard",
+    "tags": ["fluss", "flink", "producer", "metrics"],
+    "timezone": "browser",
+    "schemaVersion": 38,
+    "version": 0,
+    "refresh": "5s",
+    "time": {
+      "from": "now-15m",
+      "to": "now"
+    },
+    "panels": [
+      {
+        "id": 1,
+        "title": "Producer - Records Per Second",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "smooth",
+              "lineWidth": 2,
+              "fillOpacity": 10,
+              "pointSize": 0,
+              "axisPlacement": "left",
+              "axisLabel": "Records/sec",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "axisSoftMax": null,
+              "axisDecimals": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            },
+            "unit": "ops"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(avg_over_time(fluss_producer_records_per_second[30s]))",
+            "legendFormat": "Total Records/sec",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 2,
+        "title": "Producer - Total Records",
+        "type": "stat",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(fluss_producer_records_total)",
+            "legendFormat": "Total Records",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 3,
+        "title": "Flink - Input Records Per Second",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            },
+            "unit": "ops"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(flink_taskmanager_job_task_operator_numRecordsInPerSecond{operator_name=\"FlussChangelogFilter\"})",
+            "legendFormat": "Input Records/sec (From Fluss - Before Filter)",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 4,
+        "title": "Flink - Output Records Per Second",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            },
+            "unit": "ops"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(flink_taskmanager_job_task_operator_numRecordsOutPerSecond{job=\"flink-taskmanager\"}) by (operator_name) or vector(0)",
+            "legendFormat": "{{operator_name}}",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 5,
+        "title": "Flink - Custom Aggregator Records In",
+        "type": "stat",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(flink_taskmanager_job_task_operator_fluss_aggregator_records_in{operator_name=\"FlussSensorReadingMapper\"}) or vector(0)",
+            "legendFormat": "Total Input Records",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 6,
+        "title": "Flink - Custom Aggregator Records Out",
+        "type": "stat",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(flink_taskmanager_job_task_operator_fluss_aggregator_records_out{task_name=\"FlussAggregatorSink\"}) or vector(0)",
+            "legendFormat": "Total Output Records",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 7,
+        "title": "Flink - Event Time Lag (ms)",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 24},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "unit": "ms",
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            }
+          }
+        },
+        "targets": [
+          {
+            "expr": "flink_taskmanager_job_task_operator_fluss_aggregator_event_time_lag_ms",
+            "legendFormat": "Custom Event Time Lag",
+            "refId": "A"
+          },
+          {
+            "expr": "flink_taskmanager_job_task_operator_currentFetchEventTimeLag",
+            "legendFormat": "Flink Event Time Lag",
+            "refId": "B"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 8,
+        "title": "Flink - Backpressure",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 24},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "unit": "ms",
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            }
+          }
+        },
+        "targets": [
+          {
+            "expr": "flink_taskmanager_job_task_backPressuredTimeMsPerSecond",
+            "legendFormat": "Backpressure",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 9,
+        "title": "Flink Cluster - Running Jobs",
+        "type": "stat",
+        "gridPos": {"h": 4, "w": 6, "x": 0, "y": 32},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "flink_jobmanager_numRunningJobs",
+            "legendFormat": "Running Jobs",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 10,
+        "title": "Flink Cluster - TaskManagers",
+        "type": "stat",
+        "gridPos": {"h": 4, "w": 6, "x": 6, "y": 32},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "flink_jobmanager_numRegisteredTaskManagers",
+            "legendFormat": "Task Managers",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 11,
+        "title": "Flink Cluster - Available Task Slots",
+        "type": "stat",
+        "gridPos": {"h": 4, "w": 6, "x": 12, "y": 32},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "flink_jobmanager_taskSlotsAvailable",
+            "legendFormat": "Available Slots",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 12,
+        "title": "Flink - Total Input Records",
+        "type": "stat",
+        "gridPos": {"h": 4, "w": 6, "x": 18, "y": 32},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(flink_taskmanager_job_task_operator_fluss_aggregator_records_in{operator_name=\"FlussSensorReadingMapper\"}) or vector(0)",
+            "legendFormat": "Custom Aggregator Input",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 13,
+        "title": "Flink - Total Output Records",
+        "type": "stat",
+        "gridPos": {"h": 4, "w": 6, "x": 0, "y": 36},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(flink_taskmanager_job_task_operator_fluss_aggregator_records_out{task_name=\"FlussAggregatorSink\"}) or vector(0)",
+            "legendFormat": "Custom Aggregator Output",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 14,
+        "title": "Flink JobManager - JVM Heap Used",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 40},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "unit": "bytes",
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            }
+          }
+        },
+        "targets": [
+          {
+            "expr": "flink_jobmanager_Status_JVM_Memory_Heap_Used",
+            "legendFormat": "Heap Used",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 15,
+        "title": "Flink TaskManager - JVM Heap Used",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 40},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "unit": "bytes",
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            }
+          }
+        },
+        "targets": [
+          {
+            "expr": "flink_taskmanager_Status_JVM_Memory_Heap_Used",
+            "legendFormat": "{{tm_id}}",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 16,
+        "title": "Fluss Coordinator - Active Tablet Servers",
+        "type": "stat",
+        "gridPos": {"h": 8, "w": 6, "x": 0, "y": 48},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "fluss_coordinator_activeTabletServerCount",
+            "legendFormat": "Active Tablet Servers",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 17,
+        "title": "Fluss Coordinator - Table Count",
+        "type": "stat",
+        "gridPos": {"h": 8, "w": 6, "x": 6, "y": 48},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "fluss_coordinator_tableCount",
+            "legendFormat": "Tables",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 18,
+        "title": "Fluss Coordinator - Bucket Count",
+        "type": "stat",
+        "gridPos": {"h": 8, "w": 6, "x": 12, "y": 48},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "fluss_coordinator_bucketCount",
+            "legendFormat": "Buckets",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 19,
+        "title": "Fluss Coordinator - Partition Count",
+        "type": "stat",
+        "gridPos": {"h": 8, "w": 6, "x": 18, "y": 48},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "fluss_coordinator_partitionCount",
+            "legendFormat": "Partitions",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 20,
+        "title": "Fluss Coordinator - Event Queue Size",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 56},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10
+            },
+            "unit": "short"
+          }
+        },
+        "targets": [
+          {
+            "expr": "fluss_coordinator_eventQueueSize",
+            "legendFormat": "Event Queue Size",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 21,
+        "title": "Fluss Coordinator - Event Processing Time (p95)",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 56},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "unit": "ms",
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            }
+          }
+        },
+        "targets": [
+          {
+            "expr": "histogram_quantile(0.95, sum(rate(fluss_coordinator_eventProcessingTimeMs[1m])) by (le, host))",
+            "legendFormat": "p95 Event Processing Time - {{host}}",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 22,
+        "title": "Fluss Tablet Server - Messages In Rate",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 64},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            },
+            "unit": "ops"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(fluss_tabletserver_messagesInPerSecond) by (host)",
+            "legendFormat": "{{host}}",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 23,
+        "title": "Fluss Tablet Server - Bytes In/Out Rate",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 64},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "unit": "Bps",
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            }
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(fluss_tabletserver_bytesInPerSecond) by (host)",
+            "legendFormat": "Bytes In - {{host}}",
+            "refId": "A"
+          },
+          {
+            "expr": "sum(fluss_tabletserver_bytesOutPerSecond) by (host)",
+            "legendFormat": "Bytes Out - {{host}}",
+            "refId": "B"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 24,
+        "title": "Fluss Tablet Server - Replication Rates",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 72},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "unit": "Bps",
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            }
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(fluss_tabletserver_replicationBytesInPerSecond) by (host)",
+            "legendFormat": "Replication In - {{host}}",
+            "refId": "A"
+          },
+          {
+            "expr": "sum(fluss_tabletserver_replicationBytesOutPerSecond) by (host)",
+            "legendFormat": "Replication Out - {{host}}",
+            "refId": "B"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 25,
+        "title": "Fluss Tablet Server - Request Rates by Type",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 72},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            },
+            "unit": "ops"
+          }
+        },
+        "targets": [
+          {
+            "expr": "sum(fluss_tabletserver_request_requestsPerSecond{request=\"produceLog\"}) by (host)",
+            "legendFormat": "ProduceLog - {{host}}",
+            "refId": "A"
+          },
+          {
+            "expr": "sum(fluss_tabletserver_request_requestsPerSecond{request=\"fetchLogClient\"}) by (host)",
+            "legendFormat": "FetchLog Client - {{host}}",
+            "refId": "B"
+          },
+          {
+            "expr": "sum(fluss_tabletserver_request_requestsPerSecond{request=\"putKv\"}) by (host)",
+            "legendFormat": "PutKV - {{host}}",
+            "refId": "C"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 26,
+        "title": "Fluss Coordinator - JVM Heap Memory",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 80},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10
+            },
+            "unit": "bytes",
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            }
+          }
+        },
+        "targets": [
+          {
+            "expr": "fluss_coordinator_status_JVM_memory_heap_used",
+            "legendFormat": "Heap Used - {{host}}",
+            "refId": "A"
+          },
+          {
+            "expr": "fluss_coordinator_status_JVM_memory_heap_max",
+            "legendFormat": "Heap Max - {{host}}",
+            "refId": "B"
+          },
+          {
+            "expr": "fluss_coordinator_status_JVM_memory_heap_committed",
+            "legendFormat": "Heap Committed - {{host}}",
+            "refId": "C"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 27,
+        "title": "Fluss Coordinator - CPU Usage",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 80},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10
+            },
+            "unit": "percent",
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            }
+          }
+        },
+        "targets": [
+          {
+            "expr": "fluss_coordinator_status_JVM_CPU_load * 100",
+            "legendFormat": "CPU Load % - {{host}}",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 28,
+        "title": "Fluss Tablet Server - JVM Heap Memory",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 88},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10
+            },
+            "unit": "bytes",
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            }
+          }
+        },
+        "targets": [
+          {
+            "expr": "fluss_tabletserver_status_JVM_memory_heap_used",
+            "legendFormat": "Heap Used - {{host}}",
+            "refId": "A"
+          },
+          {
+            "expr": "fluss_tabletserver_status_JVM_memory_heap_max",
+            "legendFormat": "Heap Max - {{host}}",
+            "refId": "B"
+          },
+          {
+            "expr": "fluss_tabletserver_status_JVM_memory_heap_committed",
+            "legendFormat": "Heap Committed - {{host}}",
+            "refId": "C"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 29,
+        "title": "Fluss Tablet Server - CPU Usage",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 88},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10
+            },
+            "unit": "percent",
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            }
+          }
+        },
+        "targets": [
+          {
+            "expr": "fluss_tabletserver_status_JVM_CPU_load * 100",
+            "legendFormat": "CPU Load % - {{host}}",
+            "refId": "A"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 30,
+        "title": "Fluss Coordinator - JVM Non-Heap Memory",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 96},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10
+            },
+            "unit": "bytes",
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            }
+          }
+        },
+        "targets": [
+          {
+            "expr": "fluss_coordinator_status_JVM_memory_nonHeap_used",
+            "legendFormat": "Non-Heap Used - {{host}}",
+            "refId": "A"
+          },
+          {
+            "expr": "fluss_coordinator_status_JVM_memory_nonHeap_max",
+            "legendFormat": "Non-Heap Max - {{host}}",
+            "refId": "B"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      },
+      {
+        "id": 31,
+        "title": "Fluss Tablet Server - JVM Non-Heap Memory",
+        "type": "timeseries",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 96},
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10
+            },
+            "unit": "bytes",
+            "custom": {
+              "drawStyle": "line",
+              "lineInterpolation": "linear",
+              "fillOpacity": 10,
+              "axisPlacement": "auto",
+              "axisLabel": "",
+              "axisColorMode": "text",
+              "scaleDistribution": {"type": "linear"},
+              "axisCenteredZero": false,
+              "axisSoftMin": 0,
+              "hideFrom": {"tooltip": false, "viz": false, "legend": false}
+            }
+          }
+        },
+        "targets": [
+          {
+            "expr": "fluss_tabletserver_status_JVM_memory_nonHeap_used",
+            "legendFormat": "Non-Heap Used - {{host}}",
+            "refId": "A"
+          },
+          {
+            "expr": "fluss_tabletserver_status_JVM_memory_nonHeap_max",
+            "legendFormat": "Non-Heap Max - {{host}}",
+            "refId": "B"
+          }
+        ],
+        "datasource": {"type": "prometheus", "uid": "prometheus"}
+      }
+    ]
+  },
+  "overwrite": true
+}
diff --git a/e2e-iot/high-infra/k8s/monitoring/grafana-dashboard.yaml b/e2e-iot/high-infra/k8s/monitoring/grafana-dashboard.yaml
new file mode 100644
index 0000000..08fbb37
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/monitoring/grafana-dashboard.yaml
@@ -0,0 +1,817 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: v1
+data:
+  fluss-flink-dashboard.json: |
+    {
+      "dashboard": {
+        "title": "Fluss & Flink Monitoring Dashboard",
+        "tags": ["fluss", "flink", "producer", "metrics"],
+        "timezone": "browser",
+        "schemaVersion": 38,
+        "version": 0,
+        "refresh": "5s",
+        "time": {
+          "from": "now-15m",
+          "to": "now"
+        },
+        "panels": [
+          {
+            "id": 1,
+            "title": "Producer - Records Per Second",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "smooth",
+                  "fillOpacity": 10,
+                  "lineWidth": 2
+                },
+                "unit": "ops",
+                "decimals": 0,
+                "min": 0
+              }
+            },
+            "targets": [
+              {
+                "expr": "sum(max(rate(fluss_producer_records_total[5m])) by (pod))",
+                "legendFormat": "Total Records/sec",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 2,
+            "title": "Producer - Total Records",
+            "type": "stat",
+            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "thresholds"},
+                "unit": "short"
+              }
+            },
+            "targets": [
+              {
+                "expr": "sum(fluss_producer_records_total)",
+                "legendFormat": "Total Records",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 3,
+            "title": "Flink - Input Records Per Second",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "ops"
+              }
+            },
+            "targets": [
+              {
+                "expr": "sum(flink_taskmanager_job_task_operator_numRecordsInPerSecond{operator_name=\"FlussChangelogFilter\",job=\"flink-taskmanager\"})",
+                "legendFormat": "Input Records/sec (From Fluss - Before Filter)",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 4,
+            "title": "Flink - Output Records Per Second",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "ops"
+              }
+            },
+            "targets": [
+              {
+                "expr": "sum(flink_taskmanager_job_task_operator_numRecordsOutPerSecond{job=\"flink-taskmanager\"}) by (operator_name) or vector(0)",
+                "legendFormat": "{{operator_name}}",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 5,
+            "title": "Flink - Custom Aggregator Records In",
+            "type": "stat",
+            "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "thresholds"},
+                "unit": "short"
+              }
+            },
+            "targets": [
+              {
+                "expr": "sum(flink_taskmanager_job_task_operator_fluss_aggregator_records_in{job=\"flink-taskmanager\"}) or vector(0)",
+                "legendFormat": "Total Input Records",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 6,
+            "title": "Flink - Custom Aggregator Records Out",
+            "type": "stat",
+            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "thresholds"},
+                "unit": "short"
+              }
+            },
+            "targets": [
+              {
+                "expr": "sum(flink_taskmanager_job_task_operator_fluss_aggregator_records_out{job=\"flink-taskmanager\"}) or vector(0)",
+                "legendFormat": "Total Output Records",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 7,
+            "title": "Flink - Event Time Lag (ms)",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 0, "y": 24},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "ms"
+              }
+            },
+            "targets": [
+              {
+                "expr": "flink_taskmanager_job_task_operator_fluss_aggregator_event_time_lag_ms",
+                "legendFormat": "Custom Event Time Lag",
+                "refId": "A"
+              },
+              {
+                "expr": "flink_taskmanager_job_task_operator_currentFetchEventTimeLag",
+                "legendFormat": "Flink Event Time Lag",
+                "refId": "B"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 8,
+            "title": "Flink - Backpressure",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 24},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "ms"
+              }
+            },
+            "targets": [
+              {
+                "expr": "flink_taskmanager_job_task_backPressuredTimeMsPerSecond",
+                "legendFormat": "Backpressure",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 9,
+            "title": "Flink Cluster - Running Jobs",
+            "type": "stat",
+            "gridPos": {"h": 4, "w": 6, "x": 0, "y": 32},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "thresholds"},
+                "unit": "short"
+              }
+            },
+            "targets": [
+              {
+                "expr": "flink_jobmanager_numRunningJobs",
+                "legendFormat": "Running Jobs",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 10,
+            "title": "Flink Cluster - TaskManagers",
+            "type": "stat",
+            "gridPos": {"h": 4, "w": 6, "x": 6, "y": 32},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "thresholds"},
+                "unit": "short"
+              }
+            },
+            "targets": [
+              {
+                "expr": "flink_jobmanager_numRegisteredTaskManagers",
+                "legendFormat": "Task Managers",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 11,
+            "title": "Flink Cluster - Available Task Slots",
+            "type": "stat",
+            "gridPos": {"h": 4, "w": 6, "x": 12, "y": 32},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "thresholds"},
+                "unit": "short"
+              }
+            },
+            "targets": [
+              {
+                "expr": "flink_jobmanager_taskSlotsAvailable",
+                "legendFormat": "Available Slots",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 12,
+            "title": "Flink - Total Input Records",
+            "type": "stat",
+            "gridPos": {"h": 4, "w": 6, "x": 18, "y": 32},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "thresholds"},
+                "unit": "short"
+              }
+            },
+            "targets": [
+              {
+                "expr": "sum(flink_taskmanager_job_task_operator_fluss_aggregator_records_in{job=\"flink-taskmanager\"}) or vector(0)",
+                "legendFormat": "Custom Aggregator Input",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 13,
+            "title": "Flink - Total Output Records",
+            "type": "stat",
+            "gridPos": {"h": 4, "w": 6, "x": 0, "y": 36},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "thresholds"},
+                "unit": "short"
+              }
+            },
+            "targets": [
+              {
+                "expr": "sum(flink_taskmanager_job_task_operator_fluss_aggregator_records_out{job=\"flink-taskmanager\"}) or vector(0)",
+                "legendFormat": "Custom Aggregator Output",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 14,
+            "title": "Flink JobManager - JVM Heap Used",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 0, "y": 40},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "bytes"
+              }
+            },
+            "targets": [
+              {
+                "expr": "flink_jobmanager_Status_JVM_Memory_Heap_Used",
+                "legendFormat": "Heap Used",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 15,
+            "title": "Flink TaskManager - JVM Heap Used",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 40},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "bytes"
+              }
+            },
+            "targets": [
+              {
+                "expr": "flink_taskmanager_Status_JVM_Memory_Heap_Used",
+                "legendFormat": "{{tm_id}}",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 16,
+            "title": "Fluss Coordinator - Active Tablet Servers",
+            "type": "stat",
+            "gridPos": {"h": 8, "w": 6, "x": 0, "y": 48},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "thresholds"},
+                "unit": "short"
+              }
+            },
+            "targets": [
+              {
+                "expr": "fluss_coordinator_activeTabletServerCount",
+                "legendFormat": "Active Tablet Servers",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 17,
+            "title": "Fluss Coordinator - Table Count",
+            "type": "stat",
+            "gridPos": {"h": 8, "w": 6, "x": 6, "y": 48},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "thresholds"},
+                "unit": "short"
+              }
+            },
+            "targets": [
+              {
+                "expr": "fluss_coordinator_tableCount",
+                "legendFormat": "Tables",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 18,
+            "title": "Fluss Coordinator - Bucket Count",
+            "type": "stat",
+            "gridPos": {"h": 8, "w": 6, "x": 12, "y": 48},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "thresholds"},
+                "unit": "short"
+              }
+            },
+            "targets": [
+              {
+                "expr": "fluss_coordinator_bucketCount",
+                "legendFormat": "Buckets",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 19,
+            "title": "Fluss Coordinator - Partition Count",
+            "type": "stat",
+            "gridPos": {"h": 8, "w": 6, "x": 18, "y": 48},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "thresholds"},
+                "unit": "short"
+              }
+            },
+            "targets": [
+              {
+                "expr": "fluss_coordinator_partitionCount",
+                "legendFormat": "Partitions",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 20,
+            "title": "Fluss Coordinator - Event Queue Size",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 0, "y": 56},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "short"
+              }
+            },
+            "targets": [
+              {
+                "expr": "fluss_coordinator_eventQueueSize",
+                "legendFormat": "Event Queue Size",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 21,
+            "title": "Fluss Coordinator - Event Processing Time (p95)",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 56},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "ms"
+              }
+            },
+            "targets": [
+              {
+                "expr": "histogram_quantile(0.95, sum(rate(fluss_coordinator_eventProcessingTimeMs[1m])) by (le, host))",
+                "legendFormat": "p95 Event Processing Time - {{host}}",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 22,
+            "title": "Fluss Tablet Server - Messages In Rate",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 0, "y": 64},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "ops"
+              }
+            },
+            "targets": [
+              {
+                "expr": "sum(fluss_tabletserver_messagesInPerSecond) by (host)",
+                "legendFormat": "{{host}}",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 23,
+            "title": "Fluss Tablet Server - Bytes In/Out Rate",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 64},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "Bps"
+              }
+            },
+            "targets": [
+              {
+                "expr": "sum(fluss_tabletserver_bytesInPerSecond) by (host)",
+                "legendFormat": "Bytes In - {{host}}",
+                "refId": "A"
+              },
+              {
+                "expr": "sum(fluss_tabletserver_bytesOutPerSecond) by (host)",
+                "legendFormat": "Bytes Out - {{host}}",
+                "refId": "B"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 24,
+            "title": "Fluss Tablet Server - Replication Rates",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 0, "y": 72},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "Bps"
+              }
+            },
+            "targets": [
+              {
+                "expr": "sum(fluss_tabletserver_replicationBytesInPerSecond) by (host)",
+                "legendFormat": "Replication In - {{host}}",
+                "refId": "A"
+              },
+              {
+                "expr": "sum(fluss_tabletserver_replicationBytesOutPerSecond) by (host)",
+                "legendFormat": "Replication Out - {{host}}",
+                "refId": "B"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 25,
+            "title": "Fluss Tablet Server - Request Rates by Type",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 72},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "ops"
+              }
+            },
+            "targets": [
+              {
+                "expr": "sum(fluss_tabletserver_request_requestsPerSecond{request=\"produceLog\"}) by (host)",
+                "legendFormat": "ProduceLog - {{host}}",
+                "refId": "A"
+              },
+              {
+                "expr": "sum(fluss_tabletserver_request_requestsPerSecond{request=\"fetchLogClient\"}) by (host)",
+                "legendFormat": "FetchLog Client - {{host}}",
+                "refId": "B"
+              },
+              {
+                "expr": "sum(fluss_tabletserver_request_requestsPerSecond{request=\"putKv\"}) by (host)",
+                "legendFormat": "PutKV - {{host}}",
+                "refId": "C"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 26,
+            "title": "Fluss Coordinator - JVM Heap Memory",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 0, "y": 80},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "bytes"
+              }
+            },
+            "targets": [
+              {
+                "expr": "fluss_coordinator_status_JVM_memory_heap_used",
+                "legendFormat": "Heap Used - {{host}}",
+                "refId": "A"
+              },
+              {
+                "expr": "fluss_coordinator_status_JVM_memory_heap_max",
+                "legendFormat": "Heap Max - {{host}}",
+                "refId": "B"
+              },
+              {
+                "expr": "fluss_coordinator_status_JVM_memory_heap_committed",
+                "legendFormat": "Heap Committed - {{host}}",
+                "refId": "C"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 27,
+            "title": "Fluss Coordinator - CPU Usage",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 80},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "percent"
+              }
+            },
+            "targets": [
+              {
+                "expr": "fluss_coordinator_status_JVM_CPU_load * 100",
+                "legendFormat": "CPU Load % - {{host}}",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 28,
+            "title": "Fluss Tablet Server - JVM Heap Memory",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 0, "y": 88},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "bytes"
+              }
+            },
+            "targets": [
+              {
+                "expr": "fluss_tabletserver_status_JVM_memory_heap_used",
+                "legendFormat": "Heap Used - {{host}}",
+                "refId": "A"
+              },
+              {
+                "expr": "fluss_tabletserver_status_JVM_memory_heap_max",
+                "legendFormat": "Heap Max - {{host}}",
+                "refId": "B"
+              },
+              {
+                "expr": "fluss_tabletserver_status_JVM_memory_heap_committed",
+                "legendFormat": "Heap Committed - {{host}}",
+                "refId": "C"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 29,
+            "title": "Fluss Tablet Server - CPU Usage",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 88},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "percent"
+              }
+            },
+            "targets": [
+              {
+                "expr": "fluss_tabletserver_status_JVM_CPU_load * 100",
+                "legendFormat": "CPU Load % - {{host}}",
+                "refId": "A"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 30,
+            "title": "Fluss Coordinator - JVM Non-Heap Memory",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 0, "y": 96},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "bytes"
+              }
+            },
+            "targets": [
+              {
+                "expr": "fluss_coordinator_status_JVM_memory_nonHeap_used",
+                "legendFormat": "Non-Heap Used - {{host}}",
+                "refId": "A"
+              },
+              {
+                "expr": "fluss_coordinator_status_JVM_memory_nonHeap_max",
+                "legendFormat": "Non-Heap Max - {{host}}",
+                "refId": "B"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          },
+          {
+            "id": 31,
+            "title": "Fluss Tablet Server - JVM Non-Heap Memory",
+            "type": "timeseries",
+            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 96},
+            "fieldConfig": {
+              "defaults": {
+                "color": {"mode": "palette-classic"},
+                "custom": {
+                  "drawStyle": "line",
+                  "lineInterpolation": "linear",
+                  "fillOpacity": 10
+                },
+                "unit": "bytes"
+              }
+            },
+            "targets": [
+              {
+                "expr": "fluss_tabletserver_status_JVM_memory_nonHeap_used",
+                "legendFormat": "Non-Heap Used - {{host}}",
+                "refId": "A"
+              },
+              {
+                "expr": "fluss_tabletserver_status_JVM_memory_nonHeap_max",
+                "legendFormat": "Non-Heap Max - {{host}}",
+                "refId": "B"
+              }
+            ],
+            "datasource": {"type": "prometheus", "uid": "prometheus"}
+          }
+        ]
+      },
+      "overwrite": true
+    }
+kind: ConfigMap
+metadata:
+  creationTimestamp: null
+  labels:
+    grafana_dashboard: "1"
+  name: fluss-flink-dashboard
+  namespace: monitoring
diff --git a/e2e-iot/high-infra/k8s/monitoring/podmonitors.yaml b/e2e-iot/high-infra/k8s/monitoring/podmonitors.yaml
new file mode 100644
index 0000000..95700a5
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/monitoring/podmonitors.yaml
@@ -0,0 +1,72 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Fallback PodMonitor for producer pods, for when the ServiceMonitor doesn't work
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  namespace: fluss
+  name: fluss-producer-pods
+  labels:
+    app: fluss-producer
+spec:
+  podMetricsEndpoints:
+    - interval: 30s
+      path: /metrics
+      port: metrics
+  selector:
+    matchLabels:
+      app: fluss-producer
+---
+# PodMonitor: Flink JobManager pods (app=flink, component=jobmanager)
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  namespace: fluss
+  name: flink-jobmanager-pods
+  labels:
+    component: jobmanager
+    app: flink
+spec:
+  podMetricsEndpoints:
+    - interval: 30s
+      path: /metrics
+      port: metrics
+  selector:
+    matchLabels:
+      component: jobmanager
+      app: flink
+---
+# PodMonitor: Flink TaskManager pods (app=flink, component=taskmanager)
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  namespace: fluss
+  name: flink-taskmanager-pods
+  labels:
+    component: taskmanager
+    app: flink
+spec:
+  podMetricsEndpoints:
+    - interval: 30s
+      path: /metrics
+      port: metrics
+  selector:
+    matchLabels:
+      component: taskmanager
+      app: flink
+
diff --git a/e2e-iot/high-infra/k8s/monitoring/servicemonitors.yaml b/e2e-iot/high-infra/k8s/monitoring/servicemonitors.yaml
new file mode 100644
index 0000000..98a0e39
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/monitoring/servicemonitors.yaml
@@ -0,0 +1,110 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# ServiceMonitor: producer metrics (selector app=fluss-producer)
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  namespace: fluss
+  name: fluss-producer-metrics
+  labels:
+    app: fluss-producer
+spec:
+  endpoints:
+    - interval: 30s
+      path: /metrics
+      port: metrics
+  selector:
+    matchLabels:
+      app: fluss-producer
+---
+# ServiceMonitor: Flink JobManager metrics (app=flink, component=jobmanager)
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  namespace: fluss
+  name: flink-jobmanager-metrics
+  labels:
+    component: jobmanager
+    app: flink
+spec:
+  endpoints:
+    - interval: 30s
+      path: /metrics
+      port: metrics
+  selector:
+    matchLabels:
+      component: jobmanager
+      app: flink
+---
+# ServiceMonitor: Flink TaskManager metrics (app=flink, component=taskmanager)
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  namespace: fluss
+  name: flink-taskmanager-metrics
+  labels:
+    component: taskmanager
+    app: flink
+spec:
+  endpoints:
+    - interval: 30s
+      path: /metrics
+      port: metrics
+  selector:
+    matchLabels:
+      component: taskmanager
+      app: flink
+---
+# ServiceMonitor: Fluss Coordinator metrics (selects by app.kubernetes.io/* labels)
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  namespace: fluss
+  name: fluss-coordinator-metrics
+  labels:
+    component: coordinator
+    app: fluss
+spec:
+  endpoints:
+    - interval: 30s
+      path: /metrics
+      port: metrics
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: coordinator
+      app.kubernetes.io/name: fluss
+---
+# ServiceMonitor: Fluss Tablet Server metrics (selects by app.kubernetes.io/* labels)
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  namespace: fluss
+  name: fluss-tablet-metrics
+  labels:
+    component: tablet-server
+    app: fluss
+spec:
+  endpoints:
+    - interval: 30s
+      path: /metrics
+      port: metrics
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: tablet
+      app.kubernetes.io/name: fluss
+
diff --git a/e2e-iot/high-infra/k8s/namespace/namespace.yaml b/e2e-iot/high-infra/k8s/namespace/namespace.yaml
new file mode 100644
index 0000000..91bcbfa
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/namespace/namespace.yaml
@@ -0,0 +1,24 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: v1
+kind: Namespace
+metadata:
+  labels:
+    name: fluss
+  name: fluss
+
diff --git a/e2e-iot/high-infra/k8s/scripts/00-deploy-infra.sh b/e2e-iot/high-infra/k8s/scripts/00-deploy-infra.sh
new file mode 100755
index 0000000..3e63779
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/scripts/00-deploy-infra.sh
@@ -0,0 +1,651 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+K8S_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+TERRAFORM_DIR="${K8S_DIR}/../terraform"
+
+REGION="${REGION:-us-west-2}"
+CLUSTER_NAME="${CLUSTER_NAME:-fluss-eks-cluster}"
+
+echo "=== Step 0: Deploying infrastructure with Terraform ==="
+echo "Region: ${REGION}"
+echo "Cluster Name: ${CLUSTER_NAME}"
+echo ""
+
+# Check prerequisites for entire deployment process
+echo "=========================================="
+echo "Checking Prerequisites for Deployment"
+echo "=========================================="
+echo ""
+
+ERRORS=0
+WARNINGS=0
+
+# Check Terraform is available
+echo "[1/7] Checking Terraform..."
+if ! command -v terraform &> /dev/null; then
+    echo "  ✗ ERROR: Terraform is not installed or not in PATH"
+    echo "    Install Terraform: https://www.terraform.io/downloads"
+    ERRORS=$((ERRORS + 1))
+else
+    TERRAFORM_VERSION=$(terraform version 2>&1 | head -n 1 | sed 's/.*v\([0-9.]*\).*/\1/' 2>/dev/null || echo "unknown")
+    echo "  ✓ Terraform version: ${TERRAFORM_VERSION}"
+    
+    # Check Terraform version (>= 1.0)
+    if [ "${TERRAFORM_VERSION}" != "unknown" ]; then
+        TERRAFORM_MAJOR=$(echo "${TERRAFORM_VERSION}" | cut -d'.' -f1 2>/dev/null || echo "0")
+        if [ -n "${TERRAFORM_MAJOR}" ] && [ "${TERRAFORM_MAJOR}" -lt 1 ] 2>/dev/null; then
+            echo "  ⚠ WARNING: Terraform version should be >= 1.0"
+            WARNINGS=$((WARNINGS + 1))
+        fi
+    fi
+fi
+
+# Check AWS CLI is available
+echo ""
+echo "[2/7] Checking AWS CLI..."
+if ! command -v aws &> /dev/null; then
+    echo "  ✗ ERROR: AWS CLI is not installed or not in PATH"
+    echo "    Install AWS CLI: https://aws.amazon.com/cli/"
+    ERRORS=$((ERRORS + 1))
+else
+    AWS_VERSION=$(aws --version 2>&1 | head -n 1)
+    echo "  ✓ ${AWS_VERSION}"
+fi
+
+# Check AWS credentials are configured
+echo ""
+echo "[3/7] Checking AWS credentials..."
+if ! aws sts get-caller-identity &> /dev/null; then
+    echo "  ✗ ERROR: AWS credentials are not configured"
+    echo "    Configure AWS credentials: aws configure"
+    echo "    Or set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables"
+    ERRORS=$((ERRORS + 1))
+else
+    AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo "unknown")
+    AWS_USER=$(aws sts get-caller-identity --query Arn --output text 2>/dev/null || echo "unknown")
+    echo "  ✓ AWS Account: ${AWS_ACCOUNT}"
+    echo "  ✓ AWS User: ${AWS_USER}"
+fi
+
+# Check kubectl is available (needed for later steps)
+echo ""
+echo "[4/7] Checking kubectl..."
+if ! command -v kubectl &> /dev/null; then
+    echo "  ✗ ERROR: kubectl is not installed or not in PATH"
+    echo "    Install kubectl: https://kubernetes.io/docs/tasks/tools/"
+    ERRORS=$((ERRORS + 1))
+else
+    KUBECTL_VERSION=$(kubectl version --client 2>&1 | head -n 1 | sed 's/.*v\([0-9.]*\).*/\1/' 2>/dev/null || echo "unknown")
+    echo "  ✓ kubectl version: ${KUBECTL_VERSION}"
+fi
+
+# Check helm is available (needed for later steps)
+echo ""
+echo "[5/7] Checking helm..."
+if ! command -v helm &> /dev/null; then
+    echo "  ✗ ERROR: helm is not installed or not in PATH"
+    echo "    Install helm: https://helm.sh/docs/intro/install/"
+    ERRORS=$((ERRORS + 1))
+else
+    # Helm v3 uses --short, v2 uses different format
+    HELM_VERSION=$(helm version --short 2>&1 | head -n 1 || helm version --client --short 2>&1 | head -n 1 || helm version 2>&1 | head -n 1 | sed 's/.*v\([0-9.]*\).*/\1/' || echo "unknown")
+    echo "  ✓ helm version: ${HELM_VERSION}"
+fi
+
+# Check Docker (optional but recommended for image building)
+echo ""
+echo "[6/7] Checking Docker (optional)..."
+if ! command -v docker &> /dev/null; then
+    echo "  ⚠ WARNING: Docker is not installed (needed if building images locally)"
+    echo "    Install Docker: https://docs.docker.com/get-docker/"
+    WARNINGS=$((WARNINGS + 1))
+else
+    DOCKER_VERSION=$(docker --version 2>&1 | head -n 1 || echo "unknown")
+    echo "  ✓ ${DOCKER_VERSION}"
+fi
+
+# Check Terraform directory and configuration
+echo ""
+echo "[7/7] Checking Terraform configuration..."
+if [ ! -d "${TERRAFORM_DIR}" ]; then
+    echo "  ✗ ERROR: Terraform directory not found at ${TERRAFORM_DIR}"
+    ERRORS=$((ERRORS + 1))
+else
+    echo "  ✓ Terraform directory exists"
+    
+    # Check terraform.tfvars
+    if [ ! -f "${TERRAFORM_DIR}/terraform.tfvars" ]; then
+        echo "  ⚠ WARNING: terraform.tfvars not found"
+        if [ -f "${TERRAFORM_DIR}/terraform.tfvars.example" ]; then
+            echo "    Will create from terraform.tfvars.example"
+            WARNINGS=$((WARNINGS + 1))
+        else
+            echo "  ✗ ERROR: terraform.tfvars.example not found"
+            ERRORS=$((ERRORS + 1))
+        fi
+    else
+        echo "  ✓ terraform.tfvars exists"
+    fi
+    
+    # Check main Terraform files exist
+    if [ ! -f "${TERRAFORM_DIR}/main.tf" ]; then
+        echo "  ✗ ERROR: main.tf not found in Terraform directory"
+        ERRORS=$((ERRORS + 1))
+    else
+        echo "  ✓ main.tf exists"
+    fi
+fi
+
+# Summary
+echo ""
+echo "=========================================="
+echo "Prerequisites Check Summary"
+echo "=========================================="
+echo "Errors: ${ERRORS}"
+echo "Warnings: ${WARNINGS}"
+echo ""
+
+if [ ${ERRORS} -gt 0 ]; then
+    echo "✗ Prerequisites check FAILED"
+    echo ""
+    echo "Please fix the errors above before proceeding."
+    echo "The deployment cannot continue with missing prerequisites."
+    exit 1
+fi
+
+if [ ${WARNINGS} -gt 0 ]; then
+    echo "⚠ Prerequisites check passed with warnings"
+    echo ""
+    echo "You may proceed, but some features may not work correctly."
+    echo "Review the warnings above."
+    echo ""
+    read -p "Do you want to continue despite warnings? (yes/no): " CONTINUE
+    if [ "${CONTINUE}" != "yes" ]; then
+        echo "Deployment cancelled by user"
+        exit 0
+    fi
+    echo ""
+else
+    echo "✓ All prerequisites check passed"
+    echo ""
+fi
+
+# Check Terraform directory exists
+if [ ! -d "${TERRAFORM_DIR}" ]; then
+    echo "ERROR: Terraform directory not found at ${TERRAFORM_DIR}"
+    exit 1
+fi
+
+# Check terraform.tfvars exists
+if [ ! -f "${TERRAFORM_DIR}/terraform.tfvars" ]; then
+    echo ""
+    echo "WARNING: terraform.tfvars not found"
+    echo "Creating terraform.tfvars from example..."
+    if [ -f "${TERRAFORM_DIR}/terraform.tfvars.example" ]; then
+        cp "${TERRAFORM_DIR}/terraform.tfvars.example" "${TERRAFORM_DIR}/terraform.tfvars"
+        echo "✓ Created terraform.tfvars from example"
+        echo ""
+        echo "⚠ IMPORTANT: Please edit ${TERRAFORM_DIR}/terraform.tfvars with your values before continuing"
+        echo "  Required variables:"
+        echo "    - fluss_image_repository"
+        echo "    - demo_image_repository"
+        echo ""
+        read -p "Press Enter after updating terraform.tfvars to continue, or Ctrl+C to abort..."
+    else
+        echo "ERROR: terraform.tfvars.example not found"
+        exit 1
+    fi
+fi
+
+echo "✓ terraform.tfvars found"
+
+# Navigate to Terraform directory
+cd "${TERRAFORM_DIR}"
+
+# Check Terraform state configuration and preservation
+echo ""
+echo "=========================================="
+echo "Checking Terraform State Configuration"
+echo "=========================================="
+
+# Check if backend is configured in main.tf (before init)
+# Only an uncommented backend line counts; commented-out lines are filtered out
+if [ -f "${TERRAFORM_DIR}/main.tf" ]; then
+    BACKEND_IN_CONFIG=$(grep -E "^\s*backend\s+" "${TERRAFORM_DIR}/main.tf" 2>/dev/null | grep -v "^\s*#" || echo "")
+else
+    BACKEND_IN_CONFIG=""
+fi
+USE_LOCAL_STATE=true
+
+if [ -z "${BACKEND_IN_CONFIG}" ]; then
+    echo "⚠ WARNING: No remote backend configured in main.tf"
+    echo "  Terraform state will be stored locally in: ${TERRAFORM_DIR}/terraform.tfstate"
+    echo "  This state file is critical - if lost, Terraform cannot manage existing infrastructure"
+    echo ""
+    echo "  To preserve state, consider:"
+    echo "    1. Configure S3 backend in main.tf (uncomment backend block)"
+    echo "    2. Or backup terraform.tfstate file regularly"
+    echo ""
+    
+    # Backup existing state file if it exists
+    if [ -f "${TERRAFORM_DIR}/terraform.tfstate" ]; then
+        BACKUP_FILE="${TERRAFORM_DIR}/terraform.tfstate.backup.$(date +%Y%m%d_%H%M%S)"
+        echo "  Creating backup of existing state file..."
+        cp "${TERRAFORM_DIR}/terraform.tfstate" "${BACKUP_FILE}"
+        echo "  ✓ State backed up to: ${BACKUP_FILE}"
+        
+        # The .terraform directory (provider plugins) is only reported here, not backed up
+        if [ -d "${TERRAFORM_DIR}/.terraform" ]; then
+            echo "  Note: .terraform directory exists (contains provider plugins)"
+        fi
+    else
+        echo "  ℹ No existing state file found (first-time deployment)"
+    fi
+    echo ""
+else
+    echo "✓ Remote backend configured in main.tf"
+    echo "  State will be stored remotely"
+    USE_LOCAL_STATE=false
+    echo ""
+fi
+
+# Initialize Terraform
+echo ""
+echo "Initializing Terraform..."
+if terraform init; then
+    echo "✓ Terraform initialized"
+    
+    # After init, check actual backend status by looking at .terraform/terraform.tfstate
+    # If backend is actually configured, this file will exist and contain backend info
+    if [ -f "${TERRAFORM_DIR}/.terraform/terraform.tfstate" ]; then
+        if grep -q "backend" "${TERRAFORM_DIR}/.terraform/terraform.tfstate" 2>/dev/null; then
+            USE_LOCAL_STATE=false
+            echo "  ✓ Using remote backend (verified after init)"
+        fi
+    fi
+    
+    # Final check: if local state file exists but backend is configured, warn
+    if [ "${USE_LOCAL_STATE}" = "false" ] && [ -f "${TERRAFORM_DIR}/terraform.tfstate" ]; then
+        echo ""
+        echo "ℹ Note: Local state file exists but backend is configured"
+        echo "  State is stored remotely, local file may be outdated"
+    elif [ "${USE_LOCAL_STATE}" = "true" ] && [ -f "${TERRAFORM_DIR}/terraform.tfstate" ]; then
+        echo ""
+        echo "ℹ Using local state file"
+        echo "  If you configure a remote backend later, you'll need to migrate state:"
+        echo "    1. Uncomment backend block in main.tf"
+        echo "    2. Run: terraform init -migrate-state"
+    fi
+else
+    echo "ERROR: Terraform initialization failed"
+    exit 1
+fi
+
+# Check if EKS cluster already exists FIRST (before validation/plan)
+echo ""
+echo "Checking if EKS cluster already exists..."
+CLUSTER_EXISTS=$(aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" --query 'cluster.status' --output text 2>/dev/null || echo "NOT_FOUND")
+
+# Validate Terraform configuration
+echo ""
+echo "Validating Terraform configuration..."
+if terraform validate; then
+    echo "✓ Terraform configuration is valid"
+else
+    echo "ERROR: Terraform configuration validation failed"
+    exit 1
+fi
+
+if [ "${CLUSTER_EXISTS}" = "NOT_FOUND" ] || [ "${CLUSTER_EXISTS}" = "" ]; then
+    echo "⚠ EKS cluster does not exist yet"
+    echo "  Deploying in two phases (skipping plan to avoid Kubernetes provider errors)"
+    echo ""
+    
+    # Phase 1: Deploy AWS resources only
+    echo "=========================================="
+    echo "Phase 1: Deploying AWS Infrastructure"
+    echo "=========================================="
+    echo ""
+    echo "This will create: EKS cluster, VPC, S3 bucket, IAM roles"
+    echo "Note: Plan is skipped as Kubernetes provider will fail before cluster exists"
+    echo ""
+    read -p "Do you want to proceed? (yes/no): " CONFIRM
+    if [ "${CONFIRM}" != "yes" ]; then
+        echo "Deployment cancelled"
+        exit 0
+    fi
+    
+    echo ""
+    echo "Step 1: Creating VPC and IAM resources..."
+    terraform apply -auto-approve \
+        -target=module.vpc \
+        -target=aws_s3_bucket.flink_state \
+        -target=aws_iam_role.flink_s3_access \
+        -target=aws_iam_policy.flink_s3_access \
+        -target=aws_iam_role_policy_attachment.flink_s3_access
+    
+    echo ""
+    echo "Step 2: Creating EKS cluster (this may take 10-15 minutes)..."
+    echo "Note: Cluster will be created first, then addons and node groups"
+    # Apply EKS module - Terraform dependency graph will create cluster first
+    terraform apply -auto-approve -target=module.eks
+    
+    echo ""
+    echo "Waiting for EKS cluster to be ready..."
+    aws eks wait cluster-active --name "${CLUSTER_NAME}" --region "${REGION}"
+    echo "✓ Cluster is ACTIVE"
+    
+    # Wait for node groups to be ready
+    echo ""
+    echo "Waiting for EKS node groups to be ready..."
+    MAX_RETRIES=30
+    RETRY_COUNT=0
+    NODES_READY=false
+    
+    # Get list of node groups
+    NODE_GROUPS=$(aws eks list-nodegroups --cluster-name "${CLUSTER_NAME}" --region "${REGION}" --query 'nodegroups[]' --output text 2>/dev/null || echo "")
+    
+    if [ -n "${NODE_GROUPS}" ]; then
+        # Check each node group
+        while [ ${RETRY_COUNT} -lt ${MAX_RETRIES} ]; do
+            ALL_ACTIVE=true
+            for NODE_GROUP in ${NODE_GROUPS}; do
+                NODE_STATUS=$(aws eks describe-nodegroup \
+                    --cluster-name "${CLUSTER_NAME}" \
+                    --region "${REGION}" \
+                    --nodegroup-name "${NODE_GROUP}" \
+                    --query 'nodegroup.status' \
+                    --output text 2>/dev/null || echo "NOT_FOUND")
+                
+                if [ "${NODE_STATUS}" = "ACTIVE" ]; then
+                    continue
+                fi
+                # Any other status means "not ready": never fall through and
+                # report success for CREATE_FAILED, DEGRADED, DELETING, etc.
+                ALL_ACTIVE=false
+                case "${NODE_STATUS}" in
+                    CREATING|UPDATING|NOT_FOUND) ;;
+                    *) echo "  Node group ${NODE_GROUP} status: ${NODE_STATUS}" ;;
+                esac
+                break
+            done
+            
+            if [ "${ALL_ACTIVE}" = "true" ]; then
+                NODES_READY=true
+                break
+            else
+                echo "  Node groups are still initializing... (attempt $((RETRY_COUNT + 1))/${MAX_RETRIES})"
+                sleep 30
+                RETRY_COUNT=$((RETRY_COUNT + 1))
+            fi
+        done
+    else
+        echo "  No node groups found yet, waiting a bit..."
+        sleep 30
+    fi
+    
+    if [ "${NODES_READY}" = "true" ]; then
+        echo "✓ Node groups are ACTIVE"
+    else
+        echo "⚠ Node groups may still be initializing, but proceeding..."
+    fi
+    echo ""
+    
+    # Update kubeconfig
+    echo "Updating kubeconfig..."
+    aws eks update-kubeconfig --name "${CLUSTER_NAME}" --region "${REGION}"
+    echo "✓ Kubeconfig updated"
+    echo ""
+    
+    # Wait a moment for kubeconfig to be ready and verify connection
+    echo "Verifying Kubernetes connection..."
+    sleep 10
+    if kubectl cluster-info &>/dev/null; then
+        echo "✓ Kubernetes connection verified"
+    else
+        echo "⚠ Kubernetes connection not ready yet, but proceeding..."
+    fi
+    echo ""
+    
+    # Refresh Terraform state to pick up EKS outputs
+    echo "Refreshing Terraform state..."
+    terraform refresh -target=module.eks || true
+    echo "✓ State refreshed"
+    echo ""
+    
+    # Phase 2: Deploy Kubernetes resources
+    echo "=========================================="
+    echo "Phase 2: Deploying Kubernetes Resources"
+    echo "=========================================="
+    echo ""
+    echo "This will create: Kubernetes namespace, service account, secrets"
+    echo ""
+    read -p "Do you want to proceed? (yes/no): " CONFIRM
+    if [ "${CONFIRM}" != "yes" ]; then
+        echo "Deployment cancelled"
+        exit 0
+    fi
+    
+    echo ""
+    echo "Applying Kubernetes resources..."
+    # Only target Kubernetes resources, not module outputs which are managed by the EKS module
+    terraform apply -auto-approve \
+        -target=null_resource.wait_for_cluster \
+        -target=kubernetes_namespace.fluss \
+        -target=kubernetes_service_account.flink \
+        -target=kubernetes_secret.flink_s3_credentials
+    
+    echo "✓ Kubernetes resources deployed"
+    echo ""
+    
+    # Final apply for any remaining resources (EBS CSI IRSA, outputs, etc.)
+    echo "=========================================="
+    echo "Final: Applying All Remaining Resources"
+    echo "=========================================="
+    echo ""
+    terraform apply -auto-approve
+    echo "✓ All resources deployed"
+    
+else
+    echo "✓ EKS cluster exists (status: ${CLUSTER_EXISTS})"
+    echo "  Proceeding with normal plan/apply..."
+    echo ""
+    
+    # Normal plan/apply flow. `|| PLAN_EXIT_CODE=$?` records a failed plan without
+    # tripping errexit (if enabled), so the retry branch below stays reachable.
+    echo "Planning infrastructure changes..."
+    PLAN_EXIT_CODE=0; PLAN_OUTPUT=$(terraform plan -out=tfplan 2>&1) || PLAN_EXIT_CODE=$?
+    
+    if [ ${PLAN_EXIT_CODE} -eq 0 ]; then
+        echo "✓ Terraform plan created successfully"
+    elif echo "${PLAN_OUTPUT}" | grep -q "dial tcp.*connection refused"; then
+        echo "⚠ Plan shows Kubernetes connection errors"
+        echo "  This can happen if cluster is still initializing or kubeconfig needs update"
+        echo "  Attempting to update kubeconfig and retry..."
+        aws eks update-kubeconfig --name "${CLUSTER_NAME}" --region "${REGION}" || true
+        echo ""
+        echo "Retrying plan..."
+        if terraform plan -out=tfplan; then
+            echo "✓ Terraform plan created successfully after kubeconfig update"
+        else
+            echo "⚠ Plan still shows errors, but proceeding with apply"
+            echo "  Apply will work correctly as cluster exists"
+        fi
+    else
+        echo "ERROR: Terraform plan failed with unexpected errors:"
+        echo "${PLAN_OUTPUT}" | tail -20
+        exit 1
+    fi
+
+    # Show plan summary (only if plan file exists)
+    if [ -f "${TERRAFORM_DIR}/tfplan" ]; then
+        echo ""
+        echo "Plan summary:"
+        terraform show -no-color tfplan 2>/dev/null | grep -E "^Plan:|^  #|^  \+|^  \-|^  ~|^  ->" | head -20 || true
+        echo "  ... (showing first 20 lines, see tfplan for full details)"
+
+        # Check if plan has changes
+        PLAN_CHANGES=$(terraform show -no-color tfplan 2>/dev/null | grep -E "^Plan:" | head -1 || echo "")
+        if echo "${PLAN_CHANGES}" | grep -q "0 to add, 0 to change, 0 to destroy"; then
+            echo ""
+            echo "✓ No changes needed - infrastructure is already up to date"
+            echo "  Skipping apply step"
+        else
+            # Apply infrastructure
+            echo ""
+            echo "Applying infrastructure changes..."
+            echo "⚠ This will create/modify AWS resources. Review the plan above carefully."
+            read -p "Do you want to proceed? (yes/no): " CONFIRM
+            if [ "${CONFIRM}" != "yes" ]; then
+                echo "Deployment cancelled by user"
+                exit 0
+            fi
+            if terraform apply tfplan; then
+                echo "✓ Infrastructure deployed successfully"
+            else
+                echo "ERROR: Infrastructure deployment failed"
+                exit 1
+            fi
+        fi
+    else
+        echo ""
+        echo "⚠ Plan file not found, but proceeding with apply anyway"
+        echo "  This can happen if plan had errors but cluster exists"
+        echo ""
+        read -p "Do you want to proceed with apply? (yes/no): " CONFIRM
+        if [ "${CONFIRM}" != "yes" ]; then
+            echo "Deployment cancelled by user"
+            exit 0
+        fi
+        if terraform apply -auto-approve; then
+            echo "✓ Infrastructure deployed successfully"
+        else
+            echo "ERROR: Infrastructure deployment failed"
+            exit 1
+        fi
+    fi
+fi
+
+# Backup state file after successful apply (if using local state)
+if [ "${USE_LOCAL_STATE}" = "true" ] && [ -f "${TERRAFORM_DIR}/terraform.tfstate" ]; then
+    echo ""
+    echo "Backing up Terraform state after successful apply..."
+    BACKUP_FILE="${TERRAFORM_DIR}/terraform.tfstate.backup.$(date +%Y%m%d_%H%M%S)"
+    cp "${TERRAFORM_DIR}/terraform.tfstate" "${BACKUP_FILE}"
+    echo "✓ State backed up to: ${BACKUP_FILE}"
+fi
+
+# Get Terraform outputs
+echo ""
+echo "Retrieving Terraform outputs..."
+CLUSTER_NAME_OUTPUT=$(terraform output -raw eks_cluster_name 2>/dev/null || echo "")
+CLUSTER_ENDPOINT=$(terraform output -raw eks_cluster_endpoint 2>/dev/null || echo "")
+VPC_ID=$(terraform output -raw vpc_id 2>/dev/null || echo "")
+FLUSS_IMAGE_REPO=$(terraform output -raw fluss_image_repository 2>/dev/null || echo "")
+DEMO_IMAGE_REPO=$(terraform output -raw demo_image_repository 2>/dev/null || echo "")
+
+if [ -n "${CLUSTER_NAME_OUTPUT}" ]; then
+    echo "✓ EKS Cluster: ${CLUSTER_NAME_OUTPUT}"
+    echo "✓ Cluster Endpoint: ${CLUSTER_ENDPOINT}"
+fi
+
+if [ -n "${VPC_ID}" ]; then
+    echo "✓ VPC ID: ${VPC_ID}"
+fi
+
+# Verify cluster is ready (simple check)
+echo ""
+echo "Verifying EKS cluster status..."
+CLUSTER_STATUS=$(aws eks describe-cluster --name "${CLUSTER_NAME_OUTPUT:-${CLUSTER_NAME}}" --region "${REGION}" --query 'cluster.status' --output text 2>/dev/null || echo "NOT_FOUND")
+
+if [ "${CLUSTER_STATUS}" = "ACTIVE" ]; then
+    echo "✓ EKS cluster is ACTIVE"
+elif [ "${CLUSTER_STATUS}" = "CREATING" ]; then
+    echo "  Cluster is CREATING - it will become ACTIVE shortly"
+    echo "  You can proceed to next steps once cluster is ACTIVE"
+elif [ "${CLUSTER_STATUS}" = "NOT_FOUND" ]; then
+    echo "⚠ Cluster not found yet - it may still be creating"
+else
+    echo "⚠ Cluster status: ${CLUSTER_STATUS}"
+fi
+
+# Summary
+echo ""
+echo "=========================================="
+echo "Infrastructure Deployment Summary"
+echo "=========================================="
+echo ""
+echo "✓ Terraform infrastructure deployed successfully"
+echo ""
+echo "Cluster Information:"
+echo "  Name: ${CLUSTER_NAME_OUTPUT:-${CLUSTER_NAME}}"
+echo "  Region: ${REGION}"
+if [ -n "${CLUSTER_ENDPOINT}" ]; then
+    echo "  Endpoint: ${CLUSTER_ENDPOINT}"
+fi
+echo ""
+echo "Image Repositories:"
+if [ -n "${FLUSS_IMAGE_REPO}" ] && [ "${FLUSS_IMAGE_REPO}" != "Not configured" ]; then
+    echo "  Fluss: ${FLUSS_IMAGE_REPO}"
+else
+    echo "  Fluss: Using Docker Hub (apache/fluss)"
+fi
+if [ -n "${DEMO_IMAGE_REPO}" ] && [ "${DEMO_IMAGE_REPO}" != "Not configured" ]; then
+    echo "  Demo: ${DEMO_IMAGE_REPO}"
+else
+    echo "  Demo: Not configured - set demo_image_repository in terraform.tfvars"
+fi
+echo ""
+echo "Next Steps:"
+echo "  1. Ensure Docker images are built and pushed to ECR (if using ECR)"
+echo "  2. Proceed to Step 1: Update kubeconfig"
+echo "     Run: ./01-update-kubeconfig.sh"
+echo "     Or continue with: ./deploy-benchmark.sh --start-from-step 1"
+echo ""
+echo "To view all Terraform outputs:"
+echo "  cd ${TERRAFORM_DIR}"
+echo "  terraform output"
+echo ""
+
+# Final state preservation reminder
+if [ "${USE_LOCAL_STATE}" = "true" ]; then
+    echo "=========================================="
+    echo "⚠ IMPORTANT: Terraform State Preservation"
+    echo "=========================================="
+    echo ""
+    echo "Your Terraform state is stored locally at:"
+    echo "  ${TERRAFORM_DIR}/terraform.tfstate"
+    echo ""
+    echo "⚠ CRITICAL: This file is essential for managing your infrastructure!"
+    echo "  - DO NOT delete this file"
+    echo "  - DO NOT commit it to version control (contains sensitive data)"
+    echo "  - Backup this file regularly"
+    echo ""
+    echo "To configure remote state storage (recommended):"
+    echo "  1. Create an S3 bucket for Terraform state"
+    echo "  2. Uncomment and configure the backend block in main.tf"
+    echo "  3. Run: terraform init -migrate-state"
+    echo ""
+fi
+
+echo "✓ Step 0 completed: Infrastructure deployed successfully"
+
diff --git a/e2e-iot/high-infra/k8s/scripts/01-update-kubeconfig.sh b/e2e-iot/high-infra/k8s/scripts/01-update-kubeconfig.sh
new file mode 100755
index 0000000..040e506
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/scripts/01-update-kubeconfig.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail
+
+# Resolve paths relative to this script so it can be invoked from any directory.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+K8S_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+TERRAFORM_DIR="${K8S_DIR}/../terraform"
+
+# Target cluster; both values may be overridden through the environment.
+CLUSTER_NAME="${CLUSTER_NAME:-fluss-eks-cluster}"
+REGION="${REGION:-us-west-2}"
+
+echo "=== Step 1: Updating kubeconfig ==="
+echo "Cluster: ${CLUSTER_NAME}"
+echo "Region: ${REGION}"
+echo ""
+
+# Both CLIs are required: aws writes the kubeconfig entry, kubectl verifies it.
+if ! command -v aws > /dev/null 2>&1; then
+    echo "ERROR: AWS CLI is not installed or not in PATH"
+    exit 1
+fi
+if ! command -v kubectl > /dev/null 2>&1; then
+    echo "ERROR: kubectl is not installed or not in PATH"
+    exit 1
+fi
+
+# Write/refresh the kubeconfig entry for the cluster (errexit aborts on failure).
+echo "Updating kubeconfig for EKS cluster..."
+aws eks update-kubeconfig --region "${REGION}" --name "${CLUSTER_NAME}"
+
+# Guard clause: stop right away if the API server cannot be reached.
+echo "Verifying cluster connection..."
+if ! kubectl cluster-info > /dev/null 2>&1; then
+    echo "ERROR: Failed to connect to cluster"
+    exit 1
+fi
+
+echo "✓ Successfully connected to cluster"
+kubectl cluster-info
+echo ""
+echo "Cluster nodes:"
+kubectl get nodes
+
+echo ""
+echo "✓ Step 1 completed: kubeconfig updated successfully"
+
+
diff --git a/e2e-iot/high-infra/k8s/scripts/02-setup-storage.sh b/e2e-iot/high-infra/k8s/scripts/02-setup-storage.sh
new file mode 100755
index 0000000..89258d1
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/scripts/02-setup-storage.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+K8S_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+STORAGE_DIR="${K8S_DIR}/storage"
+
+echo "=== Step 2: Setting up local NVMe storage ==="
+echo ""
+
+# Check kubectl is available
+if ! command -v kubectl &> /dev/null; then
+    echo "ERROR: kubectl is not installed or not in PATH"
+    exit 1
+fi
+
+# Check storage setup script exists
+if [ ! -f "${STORAGE_DIR}/setup-local-storage.sh" ]; then
+    echo "ERROR: setup-local-storage.sh not found at ${STORAGE_DIR}/setup-local-storage.sh"
+    exit 1
+fi
+
+# Run storage setup script
+echo "Running storage setup script..."
+cd "${STORAGE_DIR}"
+./setup-local-storage.sh
+
+# Verify storage setup
+echo ""
+echo "Verifying storage setup..."
+if kubectl get storageclass local-storage &> /dev/null; then
+    echo "✓ StorageClass 'local-storage' created"
+else
+    echo "ERROR: StorageClass 'local-storage' not found"
+    exit 1
+fi
+
+if kubectl get pv -l component=tablet-server &> /dev/null; then
+    PV_COUNT=$(kubectl get pv -l component=tablet-server --no-headers 2>/dev/null | wc -l | awk '{print $1}')
+    echo "✓ Found ${PV_COUNT} PersistentVolumes for tablet servers"
+    kubectl get pv -l component=tablet-server
+else
+    echo "ERROR: No PersistentVolumes found for tablet servers"
+    exit 1
+fi
+
+echo ""
+echo "✓ Step 2 completed: Local NVMe storage setup completed successfully"
+
diff --git a/e2e-iot/high-infra/k8s/scripts/03-deploy-components.sh b/e2e-iot/high-infra/k8s/scripts/03-deploy-components.sh
new file mode 100755
index 0000000..adfa764
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/scripts/03-deploy-components.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+K8S_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+FLINK_DIR="${K8S_DIR}/flink"
+
+NAMESPACE="${NAMESPACE:-fluss}"
+DEMO_IMAGE_REPO="${DEMO_IMAGE_REPO:-}"
+DEMO_IMAGE_TAG="${DEMO_IMAGE_TAG:-latest}"
+FLUSS_IMAGE_REPO="${FLUSS_IMAGE_REPO:-apache/fluss:0.8.0-incubating}"
+AWS_REGION="${REGION:-us-west-2}"
+
+echo "=== Step 3: Deploying all components ==="
+echo "Namespace: ${NAMESPACE}"
+echo "Demo Image (for Flink job submission): ${DEMO_IMAGE_REPO:-<not set>}:${DEMO_IMAGE_TAG}"
+echo "Fluss Image: ${FLUSS_IMAGE_REPO}"
+echo "Flink Cluster Image: apache/flink:1.20.3-scala_2.12-java17 (hardcoded)"
+echo ""
+
+# Check kubectl is available
+if ! command -v kubectl &> /dev/null; then
+    echo "ERROR: kubectl is not installed or not in PATH"
+    exit 1
+fi
+
+# Check helm is available
+if ! command -v helm &> /dev/null; then
+    echo "ERROR: helm is not installed or not in PATH"
+    exit 1
+fi
+
+# Check deploy script exists
+if [ ! -f "${K8S_DIR}/deploy.sh" ]; then
+    echo "ERROR: deploy.sh not found at ${K8S_DIR}/deploy.sh"
+    exit 1
+fi
+
+# Hand off to deploy.sh from inside the k8s directory. It receives, in order:
+# namespace, demo image repository, demo image tag and Fluss image.
+# NOTE(review): the producer and the Flink job are expected to be handled by
+# later steps rather than by deploy.sh - confirm against deploy.sh itself.
+echo "[2/2] Deploying ZooKeeper, Fluss, Flink cluster, and Monitoring..."
+cd "${K8S_DIR}"
+./deploy.sh "${NAMESPACE}" "${DEMO_IMAGE_REPO}" "${DEMO_IMAGE_TAG}" "${FLUSS_IMAGE_REPO}"
+
+# Readiness waits are best-effort: a timeout only produces a warning, so the
+# status listing at the end is still printed.
+echo ""
+echo "Waiting for components to be ready..."
+
+echo "  Waiting for ZooKeeper..."
+if ! kubectl wait --for=condition=ready pod -l app=zookeeper -n "${NAMESPACE}" --timeout=120s; then
+    echo "WARNING: ZooKeeper pods may not be ready yet"
+fi
+
+echo "  Waiting for Fluss Coordinator..."
+if ! kubectl wait --for=condition=ready pod -l app.kubernetes.io/component=coordinator -n "${NAMESPACE}" --timeout=300s; then
+    echo "WARNING: Fluss Coordinator pods may not be ready yet"
+fi
+
+echo "  Waiting for Flink JobManager..."
+if ! kubectl wait --for=condition=ready pod -l app=flink,component=jobmanager -n "${NAMESPACE}" --timeout=300s; then
+    echo "WARNING: Flink JobManager pods may not be ready yet"
+fi
+
+echo ""
+echo "Component status:"
+kubectl get pods -n "${NAMESPACE}"
+kubectl get pods -n monitoring 2>/dev/null || echo "Monitoring namespace not ready yet"
+
+echo ""
+echo "✓ Step 3 completed: All components deployed"
+
diff --git a/e2e-iot/high-infra/k8s/scripts/04-verify-storage.sh b/e2e-iot/high-infra/k8s/scripts/04-verify-storage.sh
new file mode 100755
index 0000000..144949d
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/scripts/04-verify-storage.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+K8S_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+NAMESPACE="${NAMESPACE:-fluss}"
+
+echo "=== Step 4: Verifying NVMe storage for tablet servers ==="
+echo ""
+
+# Every check below shells out to kubectl, so make sure it exists first.
+if ! command -v kubectl > /dev/null 2>&1; then
+    echo "ERROR: kubectl is not installed or not in PATH"
+    exit 1
+fi
+
+# At least one PersistentVolume labelled component=tablet-server must exist.
+echo "Checking PersistentVolumes..."
+PV_COUNT=$(kubectl get pv -l component=tablet-server --no-headers 2>/dev/null | wc -l | awk '{print $1}')
+if [ "${PV_COUNT}" -eq 0 ]; then
+    echo "ERROR: No PersistentVolumes found for tablet servers"
+    exit 1
+fi
+
+echo "✓ Found ${PV_COUNT} PersistentVolumes for tablet servers"
+kubectl get pv -l component=tablet-server
+
+# Collect each PV's spec.local.path and look for the expected data directory.
+echo ""
+echo "Verifying PV paths (should show /opt/alldata/fluss/data)..."
+NVME_PVS=$(kubectl get pv -l component=tablet-server -o jsonpath='{.items[*].spec.local.path}' 2>/dev/null || echo "")
+if [[ "${NVME_PVS}" == *"/opt/alldata/fluss/data"* ]]; then
+    echo "✓ PVs are configured with NVMe paths"
+else
+    echo "WARNING: PV paths may not be configured correctly"
+    kubectl get pv -l component=tablet-server -o yaml | grep -A 5 "path:" || true
+fi
+
+# Verify PVCs are bound
+echo ""
+echo "Checking PersistentVolumeClaims..."
+PVC_COUNT=$(kubectl get pvc -n "${NAMESPACE}" --no-headers 2>/dev/null | wc -l | awk '{print $1}')
+if [ "${PVC_COUNT}" -eq "0" ]; then
+    echo "WARNING: No PersistentVolumeClaims found in namespace ${NAMESPACE}"
+else
+    echo "✓ Found ${PVC_COUNT} PersistentVolumeClaims"
+    kubectl get pvc -n "${NAMESPACE}"
+    # Count Bound PVCs; awk strips the padding that some wc implementations
+    # emit, the same normalisation PVC_COUNT above already applies.
+    BOUND_COUNT=$(kubectl get pvc -n "${NAMESPACE}" -o jsonpath='{.items[?(@.status.phase=="Bound")].metadata.name}' 2>/dev/null | wc -w | awk '{print $1}')
+    if [ "${BOUND_COUNT}" -lt "${PVC_COUNT}" ]; then
+        echo "WARNING: Not all PVCs are bound (${BOUND_COUNT}/${PVC_COUNT})"
+    else
+        echo "✓ All PVCs are bound"
+    fi
+fi
+
+# Verify tablet server pods have volumes mounted
+echo ""
+echo "Checking tablet server pods..."
+# Primary lookup: pods labelled app.kubernetes.io/component=tablet.
+TABLET_PODS=$(kubectl get pods -n "${NAMESPACE}" -l app.kubernetes.io/component=tablet -o jsonpath='{.items[*].metadata.name}' 2>/dev/null || echo "")
+if [ -z "${TABLET_PODS}" ]; then
+    # Fallback by name. kubectl's JSONPath has no regex (=~) support, so the
+    # names are listed one per line and filtered with grep instead.
+    TABLET_PODS=$(kubectl get pods -n "${NAMESPACE}" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null | grep 'tablet-server' | tr '\n' ' ' || true)
+fi
+
+if [ -z "${TABLET_PODS}" ]; then
+    echo "WARNING: No tablet server pods found"
+    echo "  Attempted to find pods with label: app.kubernetes.io/component=tablet"
+    echo "  Also tried to find pods matching pattern: tablet-server*"
+    echo ""
+    echo "  Available pods in namespace ${NAMESPACE}:"
+    kubectl get pods -n "${NAMESPACE}" | grep -E "NAME|tablet" || kubectl get pods -n "${NAMESPACE}"
+else
+    echo "✓ Found tablet server pods:"
+    # List exactly the pods found above; a label-selector query exits 0 with
+    # no rows when nothing matches, so it cannot drive an `||` fallback.
+    read -r -a TABLET_POD_NAMES <<< "${TABLET_PODS}"
+    kubectl get pods -n "${NAMESPACE}" -o wide "${TABLET_POD_NAMES[@]}"
+    # Inspect the first pod; df runs once so pipefail cannot skew the grep -q test.
+    FIRST_POD="${TABLET_POD_NAMES[0]}"
+    echo ""
+    echo "Checking volume mounts in pod ${FIRST_POD}..."
+    DF_OUTPUT=$(kubectl exec -n "${NAMESPACE}" "${FIRST_POD}" -- df -h || true)
+    if grep -q "alldata" <<< "${DF_OUTPUT}"; then
+        echo "✓ NVMe storage is mounted in tablet server pod"
+        grep "alldata" <<< "${DF_OUTPUT}"
+    else
+        echo "WARNING: NVMe storage may not be mounted correctly"
+    fi
+fi
+echo ""
+echo "✓ Step 4 completed: NVMe storage verification completed"
+
diff --git a/e2e-iot/high-infra/k8s/scripts/05-deploy-producer.sh b/e2e-iot/high-infra/k8s/scripts/05-deploy-producer.sh
new file mode 100755
index 0000000..f8d506e
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/scripts/05-deploy-producer.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+K8S_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+JOBS_DIR="${K8S_DIR}/jobs"
+
+NAMESPACE="${NAMESPACE:-fluss}"
+DEMO_IMAGE_REPO="${DEMO_IMAGE_REPO:-}"
+DEMO_IMAGE_TAG="${DEMO_IMAGE_TAG:-latest}"
+
+# Producer configuration defaults
+export BUCKETS="${BUCKETS:-128}"
+export PRODUCER_RATE="${PRODUCER_RATE:-250000}"
+export TOTAL_PRODUCERS="${TOTAL_PRODUCERS:-8}"
+export PRODUCER_FLUSH_EVERY="${PRODUCER_FLUSH_EVERY:-5000}"
+export CLIENT_WRITER_BATCH_TIMEOUT="${CLIENT_WRITER_BATCH_TIMEOUT:-90ms}"
+export BOOTSTRAP="${BOOTSTRAP:-coordinator-server-hs.fluss.svc.cluster.local:9124}"
+export DATABASE="${DATABASE:-iot}"
+export TABLE="${TABLE:-sensor_readings}"
+
+# Banner with the effective producer settings.
+echo "=== Step 5: Deploying multi-instance producer ==="
+echo "Namespace: ${NAMESPACE}"
+echo "Demo Image: ${DEMO_IMAGE_REPO}:${DEMO_IMAGE_TAG}"
+echo "Buckets: ${BUCKETS}"
+echo "Total Producers: ${TOTAL_PRODUCERS}"
+echo "Rate per Producer: ${PRODUCER_RATE} records/sec"
+echo ""
+
+# Preconditions, in order: kubectl on PATH, a demo image repository, and
+# both helper scripts present in the jobs directory.
+if ! command -v kubectl > /dev/null 2>&1; then
+    echo "ERROR: kubectl is not installed or not in PATH"
+    exit 1
+fi
+
+if [[ -z "${DEMO_IMAGE_REPO}" ]]; then
+    echo "ERROR: DEMO_IMAGE_REPO is not set"
+    exit 1
+fi
+
+for REQUIRED_SCRIPT in create-table.sh deploy-producer-multi-instance.sh; do
+    if [[ ! -f "${JOBS_DIR}/${REQUIRED_SCRIPT}" ]]; then
+        echo "ERROR: ${REQUIRED_SCRIPT} not found at ${JOBS_DIR}/${REQUIRED_SCRIPT}"
+        exit 1
+    fi
+done
+
+# Make these three settings visible to the helper scripts started later,
+# which run as child processes.
+export NAMESPACE
+export DEMO_IMAGE_REPO
+export DEMO_IMAGE_TAG
+
+# Step 5.1: Create table
+# The helper runs as the `if` condition: under `set -e` a separate `$?`
+# check would never execute because a failure aborts the script first.
+echo "[5.1/2] Creating Fluss table with ${BUCKETS} buckets..."
+cd "${JOBS_DIR}"
+
+if ! "${JOBS_DIR}/create-table.sh" \
+    --namespace "${NAMESPACE}" \
+    --bootstrap "${BOOTSTRAP}" \
+    --database "${DATABASE}" \
+    --table "${TABLE}" \
+    --buckets "${BUCKETS}" \
+    --image-repo "${DEMO_IMAGE_REPO}" \
+    --image-tag "${DEMO_IMAGE_TAG}"; then
+    echo "ERROR: Table creation failed"
+    exit 1
+fi
+
+echo "✓ Table created successfully"
+
+# Step 5.2: Deploy multi-instance producer
+# Checked the same way so the error message below is actually reachable.
+echo ""
+echo "[5.2/2] Deploying ${TOTAL_PRODUCERS} producer instances..."
+if ! "${JOBS_DIR}/deploy-producer-multi-instance.sh" --wait; then
+    echo "ERROR: Producer deployment failed"
+    exit 1
+fi
+
+# Verify producer pods are running
+echo ""
+echo "Verifying producer pods..."
+sleep 5
+PRODUCER_PODS=$(kubectl get pods -n "${NAMESPACE}" -l app=fluss-producer --no-headers 2>/dev/null | wc -l | awk '{print $1}')
+if [ "${PRODUCER_PODS}" -eq "0" ]; then
+    echo "ERROR: No producer pods found"
+    exit 1
+fi
+
+echo "✓ Found ${PRODUCER_PODS} producer pods"
+kubectl get pods -n "${NAMESPACE}" -l app=fluss-producer -o wide
+
+# Check pod distribution
+echo ""
+echo "Producer pod distribution:"
+kubectl get pods -n "${NAMESPACE}" -l app=fluss-producer -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.nodeName}{"\n"}{end}' | sort -k2
+
+echo ""
+echo "✓ Step 5 completed: Multi-instance producer deployed successfully"
+
diff --git a/e2e-iot/high-infra/k8s/scripts/06-submit-flink-job.sh b/e2e-iot/high-infra/k8s/scripts/06-submit-flink-job.sh
new file mode 100755
index 0000000..cc9ba83
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/scripts/06-submit-flink-job.sh
@@ -0,0 +1,159 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+K8S_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+FLINK_DIR="${K8S_DIR}/flink"
+
+NAMESPACE="${NAMESPACE:-fluss}"
+DEMO_IMAGE_REPO="${DEMO_IMAGE_REPO:-}"
+DEMO_IMAGE_TAG="${DEMO_IMAGE_TAG:-latest}"
+
+echo "=== Step 6: Submitting Flink aggregator job ==="
+echo ""
+
+# kubectl is used for every cluster interaction below.
+if ! command -v kubectl > /dev/null 2>&1; then
+    echo "ERROR: kubectl is not installed or not in PATH"
+    exit 1
+fi
+
+# An empty DEMO_IMAGE_REPO is rejected up front.
+if [[ -z "${DEMO_IMAGE_REPO}" ]]; then
+    echo "ERROR: DEMO_IMAGE_REPO environment variable is not set"
+    echo "Please set it with: export DEMO_IMAGE_REPO=your-repo/fluss-demo"
+    exit 1
+fi
+
+# Find the first pod labelled app=flink,component=jobmanager ...
+echo "Checking Flink JobManager status..."
+JOBMANAGER_POD=$(kubectl get pods -n "${NAMESPACE}" -l app=flink,component=jobmanager -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+if [[ -z "${JOBMANAGER_POD}" ]]; then
+    echo "ERROR: Flink JobManager pod not found"
+    exit 1
+fi
+
+# ... and require its phase to be Running before going any further.
+JOBMANAGER_STATUS=$(kubectl get pod -n "${NAMESPACE}" "${JOBMANAGER_POD}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "")
+if [[ "${JOBMANAGER_STATUS}" != "Running" ]]; then
+    echo "ERROR: Flink JobManager pod is not Running (status: ${JOBMANAGER_STATUS})"
+    exit 1
+fi
+echo "✓ Flink JobManager is ready: ${JOBMANAGER_POD}"
+echo ""
+
+# The job JAR must already be present inside the JobManager pod; when it is
+# missing, print guidance and stop.
+if ! kubectl exec -n "${NAMESPACE}" "${JOBMANAGER_POD}" -- test -f /opt/flink/usrlib/fluss-flink-realtime-demo.jar 2>/dev/null; then
+    echo "WARNING: JAR not found in JobManager pod at /opt/flink/usrlib/fluss-flink-realtime-demo.jar"
+    echo "The JobManager should have the JAR mounted via init container."
+    echo "Please ensure DEMO_IMAGE_REPO is set and redeploy the JobManager."
+    exit 1
+fi
+echo "✓ JAR found in JobManager pod at /opt/flink/usrlib/fluss-flink-realtime-demo.jar"
+echo ""
+
+# Submit job from inside JobManager pod (same approach as submit-job-local.sh).
+# The exec is the `if` condition: with `set -e` a later `$?` test is never reached.
+# Job state comes from the top-level "state" field of GET /jobs/<id>.
+echo "[2/4] Submitting Flink job from inside JobManager pod..."
+if ! kubectl exec -n "${NAMESPACE}" "${JOBMANAGER_POD}" -- sh -c "
+set -e
+
+echo '[2/4] Cancelling existing Flink jobs...'
+EXISTING_JOBS=\$(curl -s http://localhost:8081/jobs 2>/dev/null | grep -o '\"id\":\"[^\"]*' | sed 's/\"id\":\"//' || echo '')
+
+if [ -n \"\${EXISTING_JOBS}\" ]; then
+    echo \"\${EXISTING_JOBS}\" | while read -r job_id; do
+        if [ -n \"\${job_id}\" ]; then
+            JOB_STATUS=\$(curl -s \"http://localhost:8081/jobs/\${job_id}\" 2>/dev/null | grep -o '\"state\":\"[^\"]*' | head -n 1 | sed 's/\"state\":\"//' || echo '')
+            if [ \"\${JOB_STATUS}\" = \"RUNNING\" ] || [ \"\${JOB_STATUS}\" = \"CREATED\" ]; then
+                echo \"  Cancelling job: \${job_id}\"
+                curl -s -X PATCH \"http://localhost:8081/jobs/\${job_id}\" > /dev/null 2>&1 || true
+            fi
+        fi
+    done
+    sleep 3
+else
+    echo '  ℹ No running jobs found'
+fi
+echo ''
+
+echo '[3/4] Uploading JAR to Flink cluster...'
+UPLOAD_RESPONSE=\$(curl -s -X POST \
+    \"http://localhost:8081/v1/jars/upload\" \
+    -H \"Content-Type: multipart/form-data\" \
+    -F \"jarfile=@/opt/flink/usrlib/fluss-flink-realtime-demo.jar\")
+
+JAR_ID=\$(echo \"\${UPLOAD_RESPONSE}\" | grep -o 'flink-web-upload/[^\"]*' | sed 's|flink-web-upload/||' || echo '')
+
+if [ -z \"\${JAR_ID}\" ]; then
+    JAR_ID=\$(echo \"\${UPLOAD_RESPONSE}\" | grep -o '\"filename\":\"[^\"]*' | sed 's/\"filename\":\"//' | sed 's|.*/||' || echo '')
+fi
+
+if [ -z \"\${JAR_ID}\" ] || [ \"\${JAR_ID}\" = \"null\" ]; then
+    echo \"ERROR: Failed to extract JAR ID from upload response\"
+    echo \"Response: \${UPLOAD_RESPONSE}\"
+    exit 1
+fi
+
+echo \"✓ JAR uploaded with ID: \${JAR_ID}\"
+echo ''
+
+echo '[4/4] Submitting job via REST API...'
+JOB_RESPONSE=\$(curl -s -X POST \
+    \"http://localhost:8081/v1/jars/\${JAR_ID}/run\" \
+    -H \"Content-Type: application/json\" \
+    -d '{
+        \"entryClass\": \"org.apache.fluss.benchmark.e2eplatformaws.flink.FlinkSensorAggregatorJob\",
+        \"programArgs\": \"--bootstrap coordinator-server-hs.${NAMESPACE}.svc.cluster.local:9124 --database iot --table sensor_readings --window-minutes 1\",
+        \"parallelism\": 192
+    }')
+
+JOB_ID=\$(echo \"\${JOB_RESPONSE}\" | grep -o '\"jobid\":\"[^\"]*' | sed 's/\"jobid\":\"//' || echo '')
+
+if [ -z \"\${JOB_ID}\" ] || [ \"\${JOB_ID}\" = \"null\" ]; then
+    echo \"ERROR: Failed to extract Job ID from submission response\"
+    echo \"Response: \${JOB_RESPONSE}\"
+    exit 1
+fi
+
+echo ''
+echo \"✓ Job submitted successfully!\"
+echo \"Job ID: \${JOB_ID}\"
+echo ''
+echo \"Job status URL: http://localhost:8081/#/job/\${JOB_ID}\"
+
+# Wait and check job status
+sleep 5
+JOB_STATUS=\$(curl -s \"http://localhost:8081/jobs/\${JOB_ID}\" 2>/dev/null | grep -o '\"state\":\"[^\"]*' | sed 's/\"state\":\"//' || echo 'UNKNOWN')
+echo \"Job status: \${JOB_STATUS}\"
+"; then
+    echo ""
+    echo "ERROR: Flink job submission failed"
+    exit 1
+fi
+
+echo ""
+echo "✓ Step 6 completed: Flink aggregator job submitted successfully"
+echo ""
+echo "Monitor job at:"
+echo "  kubectl port-forward -n ${NAMESPACE} svc/flink-jobmanager 8081:8081"
+echo "  Then open: http://localhost:8081"
diff --git a/e2e-iot/high-infra/k8s/scripts/07-deploy-dashboard.sh b/e2e-iot/high-infra/k8s/scripts/07-deploy-dashboard.sh
new file mode 100755
index 0000000..bcbb70a
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/scripts/07-deploy-dashboard.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Step 7: deploy the Grafana dashboard.
+#
+# Runs ../monitoring/deploy-dashboard.sh when it exists; otherwise applies
+# ../monitoring/grafana-dashboard.yaml directly with kubectl. Afterwards it
+# reports (as warnings only) whether a Grafana pod is Running and whether the
+# fluss-flink-dashboard ConfigMap exists in the "monitoring" namespace.
+#
+# Environment:
+#   NAMESPACE  Defaults to "fluss"; overridden to "monitoring" for the
+#              deploy-dashboard.sh child process.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+K8S_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+MONITORING_DIR="${K8S_DIR}/monitoring"
+
+NAMESPACE="${NAMESPACE:-fluss}"
+
+echo "=== Step 7: Deploying Grafana dashboard ==="
+echo ""
+
+# Check kubectl is available
+if ! command -v kubectl &> /dev/null; then
+    echo "ERROR: kubectl is not installed or not in PATH"
+    exit 1
+fi
+
+# Check dashboard deployment script exists
+if [ ! -f "${MONITORING_DIR}/deploy-dashboard.sh" ]; then
+    echo "WARNING: deploy-dashboard.sh not found at ${MONITORING_DIR}/deploy-dashboard.sh"
+    echo "Attempting to deploy dashboard manually..."
+
+    # Deploy dashboard ConfigMap if script doesn't exist
+    if [ -f "${MONITORING_DIR}/grafana-dashboard.yaml" ]; then
+        echo "Deploying Grafana dashboard ConfigMap..."
+        kubectl apply -f "${MONITORING_DIR}/grafana-dashboard.yaml"
+        echo "✓ Dashboard ConfigMap deployed"
+    else
+        echo "ERROR: grafana-dashboard.yaml not found"
+        exit 1
+    fi
+else
+    # Use the deployment script
+    echo "Running dashboard deployment script..."
+    cd "${MONITORING_DIR}"
+    # Dashboard ConfigMap goes in "monitoring" namespace, not the main namespace
+    export NAMESPACE="monitoring"
+
+    # Test the command directly: under `set -e` a separate `$?` check after a
+    # failing command is never reached, so the error message would be lost.
+    if ! ./deploy-dashboard.sh; then
+        echo "ERROR: Dashboard deployment failed"
+        exit 1
+    fi
+fi
+
+# Verify Grafana pod is running
+echo ""
+echo "Checking Grafana status..."
+GRAFANA_POD=$(kubectl get pod -n monitoring -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+if [ -z "${GRAFANA_POD}" ]; then
+    echo "WARNING: Grafana pod not found in monitoring namespace"
+else
+    GRAFANA_STATUS=$(kubectl get pod -n monitoring "${GRAFANA_POD}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "")
+    if [ "${GRAFANA_STATUS}" = "Running" ]; then
+        echo "✓ Grafana is running: ${GRAFANA_POD}"
+    else
+        echo "WARNING: Grafana pod is not Running (status: ${GRAFANA_STATUS})"
+    fi
+fi
+
+# Verify dashboard ConfigMap
+echo ""
+echo "Checking dashboard ConfigMap..."
+if kubectl get configmap -n monitoring fluss-flink-dashboard &> /dev/null; then
+    echo "✓ Dashboard ConfigMap exists"
+else
+    echo "WARNING: Dashboard ConfigMap not found"
+fi
+
+echo ""
+echo "✓ Step 7 completed: Grafana dashboard deployed successfully"
+
+
diff --git a/e2e-iot/high-infra/k8s/scripts/08-verify-deployment.sh b/e2e-iot/high-infra/k8s/scripts/08-verify-deployment.sh
new file mode 100755
index 0000000..285707b
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/scripts/08-verify-deployment.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Step 8: verify the benchmark deployment.
+#
+# Hard checks (any failure makes the script exit 1):
+#   - at least one pod exists in ${NAMESPACE}
+#   - every pod line in ${NAMESPACE} shows Running or Completed
+# Informational output only: pods in "monitoring", labelled nodes,
+# ServiceMonitors/PodMonitors and recent producer / Flink TaskManager logs.
+#
+# Environment:
+#   NAMESPACE  Namespace of the Fluss/Flink workloads (default: fluss)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+K8S_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+NAMESPACE="${NAMESPACE:-fluss}"
+
+echo "=== Step 8: Verifying deployment ==="
+echo ""
+
+# Check kubectl is available
+if ! command -v kubectl &> /dev/null; then
+    echo "ERROR: kubectl is not installed or not in PATH"
+    exit 1
+fi
+
+VERIFICATION_FAILED=0
+
+# Check all pods in fluss namespace
+echo "Checking pods in ${NAMESPACE} namespace..."
+PODS=$(kubectl get pods -n "${NAMESPACE}" --no-headers 2>/dev/null || echo "")
+if [ -z "${PODS}" ]; then
+    echo "ERROR: No pods found in ${NAMESPACE} namespace"
+    VERIFICATION_FAILED=1
+else
+    echo "✓ Found pods in ${NAMESPACE} namespace:"
+    kubectl get pods -n "${NAMESPACE}"
+
+    # Count pods not in Running/Completed state. awk is used instead of
+    # `grep -v | wc -l` because grep exits 1 when every pod is healthy, which
+    # under `set -o pipefail` + `set -e` would abort the script right here.
+    NOT_RUNNING=$(printf '%s\n' "${PODS}" | awk '!/Running|Completed/ { n++ } END { print n + 0 }')
+    if [ "${NOT_RUNNING}" -gt "0" ]; then
+        echo ""
+        echo "WARNING: Some pods are not in Running/Completed state:"
+        kubectl get pods -n "${NAMESPACE}" | grep -v "Running\|Completed" || true
+        VERIFICATION_FAILED=1
+    fi
+fi
+
+# Check monitoring namespace
+echo ""
+echo "Checking pods in monitoring namespace..."
+MONITORING_PODS=$(kubectl get pods -n monitoring --no-headers 2>/dev/null || echo "")
+if [ -z "${MONITORING_PODS}" ]; then
+    echo "WARNING: No pods found in monitoring namespace"
+else
+    echo "✓ Found pods in monitoring namespace:"
+    kubectl get pods -n monitoring
+fi
+
+# Check node placement
+echo ""
+echo "Checking node placement..."
+echo "Coordinator nodes:"
+kubectl get nodes -l node-type=coordinator 2>/dev/null || echo "  No coordinator nodes found"
+echo "Tablet server nodes:"
+kubectl get nodes -l node-type=tablet-server 2>/dev/null || echo "  No tablet server nodes found"
+echo "Flink nodes:"
+kubectl get nodes -l node-type=flink-jobmanager 2>/dev/null || echo "  No Flink JobManager nodes found"
+kubectl get nodes -l node-type=flink-taskmanager 2>/dev/null || echo "  No Flink TaskManager nodes found"
+echo "Producer nodes:"
+kubectl get nodes -l node-type=producer 2>/dev/null || echo "  No producer nodes found"
+
+# Check ServiceMonitors and PodMonitors.
+# `kubectl get` exits 0 for an empty list, so test for non-empty output
+# (same pattern as the pod checks above) instead of the exit status.
+echo ""
+echo "Checking ServiceMonitors and PodMonitors..."
+SERVICE_MONITORS=$(kubectl get servicemonitor -n "${NAMESPACE}" --no-headers 2>/dev/null || echo "")
+if [ -n "${SERVICE_MONITORS}" ]; then
+    echo "✓ ServiceMonitors found:"
+    kubectl get servicemonitor -n "${NAMESPACE}"
+else
+    echo "WARNING: No ServiceMonitors found"
+fi
+
+POD_MONITORS=$(kubectl get podmonitor -n "${NAMESPACE}" --no-headers 2>/dev/null || echo "")
+if [ -n "${POD_MONITORS}" ]; then
+    echo "✓ PodMonitors found:"
+    kubectl get podmonitor -n "${NAMESPACE}"
+else
+    echo "WARNING: No PodMonitors found"
+fi
+
+# Check data flow
+echo ""
+echo "Checking data flow..."
+echo "Producer logs (last 10 lines):"
+kubectl logs -n "${NAMESPACE}" -l app=fluss-producer --tail=10 2>/dev/null | grep -i "records\|throughput" || echo "  No producer logs found"
+
+echo ""
+echo "Flink TaskManager logs (last 10 lines):"
+kubectl logs -n "${NAMESPACE}" -l app=flink,component=taskmanager --tail=10 2>/dev/null | grep -i "aggregate\|records" || echo "  No Flink logs found"
+
+# Summary
+echo ""
+echo "=== Verification Summary ==="
+if [ "${VERIFICATION_FAILED}" -eq "0" ]; then
+    echo "✓ All verifications passed"
+    echo ""
+    echo "Access services:"
+    echo "  Flink Web UI: kubectl port-forward -n ${NAMESPACE} svc/flink-jobmanager 8081:8081"
+    echo "  Grafana: kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80"
+    echo "  Prometheus: kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090"
+    echo ""
+    echo "✓ Step 8 completed: Deployment verification completed successfully"
+else
+    echo "⚠ Some verifications failed - please check the output above"
+    exit 1
+fi
+
diff --git a/e2e-iot/high-infra/k8s/scripts/09-view-metrics.sh b/e2e-iot/high-infra/k8s/scripts/09-view-metrics.sh
new file mode 100755
index 0000000..cfa7044
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/scripts/09-view-metrics.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Step 9: open a Grafana port-forward to view end-to-end metrics.
+#
+# Changes to the directory three levels above this script and exec's
+# ./port-forward-grafana.sh from there (making it executable first if needed).
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# Base directory is three levels up: scripts -> k8s -> high-infra -> base
+BASE_DIR="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
+
+echo "=== Step 9: View End-to-End Metrics ==="
+echo ""
+
+# Change to base directory
+cd "${BASE_DIR}"
+
+# Check if port-forward script exists
+if [ ! -f "./port-forward-grafana.sh" ]; then
+    echo "ERROR: Port-forward script not found"
+    echo "Expected location: ${BASE_DIR}/port-forward-grafana.sh"
+    echo "Current directory: $(pwd)"
+    exit 1
+fi
+
+# Check if script is executable
+if [ ! -x "./port-forward-grafana.sh" ]; then
+    echo "Making port-forward script executable..."
+    chmod +x "./port-forward-grafana.sh"
+fi
+
+echo "Found port-forward script in: $(pwd)"
+echo ""
+echo "Starting Grafana port-forward to view end-to-end metrics..."
+echo ""
+
+# Execute the port-forward script
+# (exec replaces this shell, so nothing after this line runs)
+exec ./port-forward-grafana.sh
+
diff --git a/e2e-iot/high-infra/k8s/scripts/SCRIPTS.md b/e2e-iot/high-infra/k8s/scripts/SCRIPTS.md
new file mode 100644
index 0000000..5f259fc
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/scripts/SCRIPTS.md
@@ -0,0 +1,340 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+# Benchmark Deployment Scripts
+
+This directory contains individual deployment scripts for each step of the 2-million-messages-per-second benchmark deployment, along with a master script that orchestrates all steps with error handling.
+
+## Master Script
+
+### `deploy-benchmark.sh`
+
+The master script that runs all deployment steps in sequence with comprehensive error handling.
+
+**Usage:**
+
+```bash
+# Run all steps
+./deploy-benchmark.sh
+
+# Skip specific steps
+./deploy-benchmark.sh --skip-step 4 --skip-step 7
+
+# Start from a specific step (skip previous steps)
+./deploy-benchmark.sh --start-from-step 5
+
+# Run only a specific step
+./deploy-benchmark.sh --only-step 3
+
+# Show help
+./deploy-benchmark.sh --help
+```
+
+**Environment Variables:**
+
+- `NAMESPACE` - Kubernetes namespace (default: `fluss`)
+- `DEMO_IMAGE_REPO` - Demo image repository (required for step 5)
+- `DEMO_IMAGE_TAG` - Demo image tag (default: `latest`)
+- `FLUSS_IMAGE_REPO` - Fluss image repository (default: `apache/fluss:0.8.0-incubating`)
+- `CLUSTER_NAME` - EKS cluster name (default: `fluss-eks-cluster`)
+- `REGION` - AWS region (default: `us-west-2`)
+
+**Example:**
+
+```bash
+export DEMO_IMAGE_REPO=343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss-demo
+export FLUSS_IMAGE_REPO=343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss
+./deploy-benchmark.sh
+```
+
+## Individual Scripts
+
+Each script can be run independently, but they are designed to be called by the master script.
+
+### Step 0: `00-deploy-infra.sh`
+
+Deploys AWS infrastructure using Terraform (EKS cluster, node groups, VPC, etc.).
+
+**Requirements:**
+- Terraform installed (>= 1.0)
+- AWS CLI configured with appropriate credentials
+- AWS account with permissions to create EKS clusters, VPCs, EC2 instances, etc.
+
+**Environment Variables:**
+- `REGION` - AWS region (default: `us-west-2`)
+- `CLUSTER_NAME` - EKS cluster name (default: `fluss-eks-cluster`)
+- `AUTO_APPROVE` - Auto-approve terraform apply (default: `false`)
+
+**What it does:**
+1. Checks prerequisites (Terraform, AWS CLI, credentials)
+2. Validates terraform.tfvars exists (creates from example if missing)
+3. Initializes Terraform
+4. Validates Terraform configuration
+5. Creates Terraform plan
+6. Applies infrastructure (with confirmation unless AUTO_APPROVE=true)
+7. Waits for EKS cluster to be ACTIVE
+8. Waits for nodes to join the cluster
+9. Outputs cluster information and next steps
+
+**Configuration:**
+Before running, ensure `terraform.tfvars` is configured with:
+- `fluss_image_repository` - ECR repository URL for Fluss image
+- `demo_image_repository` - ECR repository URL for demo image
+- Other required variables (see `terraform.tfvars.example`)
+
+**Example:**
+```bash
+# Interactive (with confirmation)
+./00-deploy-infra.sh
+
+# Auto-approve (no confirmation prompt)
+AUTO_APPROVE=true ./00-deploy-infra.sh
+```
+
+### Step 1: `01-update-kubeconfig.sh`
+
+Updates kubeconfig to connect to the EKS cluster.
+
+**Requirements:**
+- AWS CLI configured
+- kubectl installed
+- EKS cluster exists
+
+**Environment Variables:**
+- `CLUSTER_NAME` - EKS cluster name (default: `fluss-eks-cluster`)
+- `REGION` - AWS region (default: `us-west-2`)
+
+### Step 2: `02-setup-storage.sh`
+
+Sets up local NVMe storage for Fluss tablet servers.
+
+**Requirements:**
+- kubectl configured
+- Storage setup script exists at `../storage/setup-local-storage.sh`
+
+**What it does:**
+- Creates `local-storage` StorageClass
+- Creates PersistentVolumes for tablet servers
+- Verifies storage setup
+
+### Step 3: `03-deploy-components.sh`
+
+Deploys all infrastructure components:
+- ZooKeeper
+- Fluss (Coordinator + Tablet Servers)
+- Flink cluster (JobManager + TaskManagers)
+- Monitoring stack (Prometheus + Grafana)
+- ServiceMonitors and PodMonitors
+
+**Requirements:**
+- kubectl configured
+- helm installed
+- `deploy.sh` script exists at `../deploy.sh`
+
+**Environment Variables:**
+- `NAMESPACE` - Kubernetes namespace (default: `fluss`)
+- `DEMO_IMAGE_TAG` - Demo image tag (default: `latest`)
+- `FLUSS_IMAGE_REPO` - Fluss image repository
+
+**Note:** This step skips producer deployment, which is handled separately in step 5.
+
+### Step 4: `04-verify-storage.sh`
+
+Verifies that NVMe storage is correctly configured for tablet servers.
+
+**Requirements:**
+- kubectl configured
+- Step 2 completed successfully
+
+**What it checks:**
+- PersistentVolumes exist and are bound
+- PVs are configured with NVMe paths (`/opt/alldata/fluss/data`)
+- Tablet server pods have volumes mounted correctly
+
+### Step 5: `05-deploy-producer.sh`
+
+Deploys the multi-instance producer (8 instances, 2 per node).
+
+**Requirements:**
+- kubectl configured
+- Step 3 completed successfully
+- `DEMO_IMAGE_REPO` environment variable set
+
+**Environment Variables:**
+- `NAMESPACE` - Kubernetes namespace (default: `fluss`)
+- `DEMO_IMAGE_REPO` - Demo image repository (**required**)
+- `DEMO_IMAGE_TAG` - Demo image tag (default: `latest`)
+- `BUCKETS` - Number of buckets (default: `128`)
+- `TOTAL_PRODUCERS` - Number of producer instances (default: `8`)
+- `PRODUCER_RATE` - Records per second per producer (default: `250000`)
+- `BOOTSTRAP` - Fluss coordinator address
+- `DATABASE` - Database name (default: `iot`)
+- `TABLE` - Table name (default: `sensor_readings`)
+
+**What it does:**
+1. Creates Fluss table with specified number of buckets
+2. Deploys multi-instance producer job
+3. Verifies producer pods are running
+
+### Step 6: `06-submit-flink-job.sh`
+
+Submits the Flink aggregator job.
+
+**Requirements:**
+- kubectl configured
+- Step 3 completed successfully
+- Flink JobManager pod running
+
+**Environment Variables:**
+- `NAMESPACE` - Kubernetes namespace (default: `fluss`)
+
+**What it does:**
+1. Verifies Flink JobManager is ready
+2. Submits Flink aggregator job via REST API
+3. Verifies job is running
+
+### Step 7: `07-deploy-dashboard.sh`
+
+Deploys Grafana dashboard for monitoring.
+
+**Requirements:**
+- kubectl configured
+- Step 3 completed successfully
+- Grafana pod running in monitoring namespace
+
+**Environment Variables:**
+- `NAMESPACE` - Kubernetes namespace (default: `fluss`); note that the dashboard itself is always deployed to and verified in the `monitoring` namespace
+
+**What it does:**
+1. Deploys Grafana dashboard ConfigMap
+2. Imports dashboard via Grafana API (if possible)
+3. Verifies dashboard is available
+
+### Step 8: `08-verify-deployment.sh`
+
+Performs final verification of the deployment.
+
+**Requirements:**
+- kubectl configured
+- All previous steps completed
+
+**What it checks:**
+- All pods in the `NAMESPACE` namespace are `Running` or `Completed` (pods in `monitoring` are listed but not enforced)
+- Node placement is correct
+- ServiceMonitors and PodMonitors are deployed
+- Data flow is working (producer and Flink logs)
+
+## Error Handling
+
+The master script (`deploy-benchmark.sh`) provides comprehensive error handling:
+
+- **Step Failure Detection**: If any step fails, the script immediately stops and reports which step failed
+- **Clear Error Messages**: Each failure includes:
+  - Step number and description
+  - Exit code
+  - Instructions for retrying
+- **Retry Options**: Failed steps can be retried using:
+  - `--start-from-step N` - Retry from a specific step
+  - `--only-step N` - Retry only a specific step
+
+## Example Workflows
+
+### Full Deployment
+
+```bash
+# Step 0: Deploy infrastructure (if not already done)
+./00-deploy-infra.sh
+
+# Steps 1-8: Deploy all components
+export DEMO_IMAGE_REPO=your-repo/fluss-demo
+export FLUSS_IMAGE_REPO=your-repo/fluss
+./deploy-benchmark.sh --start-from-step 1
+```
+
+Or run everything including infrastructure:
+
+```bash
+export DEMO_IMAGE_REPO=your-repo/fluss-demo
+export FLUSS_IMAGE_REPO=your-repo/fluss
+./deploy-benchmark.sh  # Runs steps 0-8
+```
+
+### Deployment After Infrastructure is Ready
+
+If infrastructure is already deployed, start from step 1:
+
+```bash
+./deploy-benchmark.sh --start-from-step 1
+```
+
+If components are already deployed, start from step 5:
+
+```bash
+export DEMO_IMAGE_REPO=your-repo/fluss-demo
+./deploy-benchmark.sh --start-from-step 5
+```
+
+### Retry Failed Step
+
+If step 6 failed, retry only that step:
+
+```bash
+./deploy-benchmark.sh --only-step 6
+```
+
+### Skip Verification Steps
+
+Skip storage verification and final verification:
+
+```bash
+./deploy-benchmark.sh --skip-step 4 --skip-step 8
+```
+
+## Troubleshooting
+
+### Step Fails with "Script not found"
+
+Ensure you're running scripts from the `scripts/` directory or using absolute paths.
+
+### Step 5 Fails: "DEMO_IMAGE_REPO is not set"
+
+Set the environment variable:
+```bash
+export DEMO_IMAGE_REPO=your-repo/fluss-demo
+```
+
+### Step 3 Fails: "kubectl is not installed"
+
+Install kubectl and ensure it's in your PATH.
+
+### Step 3 Fails: "helm is not installed"
+
+Install helm:
+```bash
+# macOS
+brew install helm
+
+# Linux
+curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+```
+
+## Related Documentation
+
+- [DEPLOY-STEPS.md](../DEPLOY-STEPS.md) - Detailed deployment guide
+- [DEPLOYMENT.md](../DEPLOYMENT.md) - Kubernetes deployment guide
+- [instruction.md](../../instruction.md) - Deployment instructions overview
+
diff --git a/e2e-iot/high-infra/k8s/scripts/TERRAFORM_STATE_PRESERVATION.md b/e2e-iot/high-infra/k8s/scripts/TERRAFORM_STATE_PRESERVATION.md
new file mode 100644
index 0000000..143ea59
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/scripts/TERRAFORM_STATE_PRESERVATION.md
@@ -0,0 +1,257 @@
+# Terraform State Preservation in 00-deploy-infra.sh
+
+## Problem
+
+The `00-deploy-infra.sh` script was not preserving Terraform state, which could lead to:
+- Loss of infrastructure management capability if state file is deleted
+- Inability to track existing resources
+- Risk of orphaned resources or accidental recreation
+
+## Solution
+
+The script has been updated to:
+1. **Detect backend configuration** - Checks if remote backend is configured in `main.tf`
+2. **Backup local state** - Automatically backs up state file before and after operations
+3. **Warn about state preservation** - Provides clear warnings and instructions
+4. **Verify backend status** - Confirms backend configuration after `terraform init`
+
+## How It Works
+
+### 1. Pre-Init State Check
+
+Before running `terraform init`, the script:
+- Checks if backend block is configured in `main.tf`
+- If no backend is configured:
+  - Warns that state will be stored locally
+  - Creates a timestamped backup of existing state file (if it exists)
+  - Provides instructions for configuring remote backend
+
+### 2. Post-Init Verification
+
+After `terraform init`, the script:
+- Verifies if backend is actually being used
+- Checks `.terraform/terraform.tfstate` for backend configuration
+- Updates `USE_LOCAL_STATE` flag accordingly
+- Provides migration instructions if needed
+
+### 3. Post-Apply Backup
+
+After successful `terraform apply`:
+- If using local state, creates a timestamped backup
+- Backup filename format: `terraform.tfstate.backup.YYYYMMDD_HHMMSS`
+
+### 4. Final Reminder
+
+At the end of the script:
+- If using local state, displays important reminders:
+  - Location of state file
+  - Warning not to delete it
+  - Instructions for configuring remote backend
+
+## State File Locations
+
+### Local State (Default)
+- **State file**: `terraform/terraform.tfstate`
+- **Backups**: `terraform/terraform.tfstate.backup.*`
+- **Backend config**: `.terraform/terraform.tfstate` (if backend configured)
+
+### Remote State (S3 Backend)
+- **State file**: Stored in S3 bucket
+- **Local cache**: `.terraform/terraform.tfstate`
+- **Backend config**: `.terraform/terraform.tfstate`
+
+## Configuring Remote State
+
+To configure S3 backend for state preservation:
+
+### Step 1: Create S3 Bucket
+
+```bash
+aws s3 mb s3://your-terraform-state-bucket --region us-west-2
+aws s3api put-bucket-versioning \
+  --bucket your-terraform-state-bucket \
+  --versioning-configuration Status=Enabled
+```
+
+### Step 2: Update main.tf
+
+Uncomment and configure the backend block in `terraform/main.tf`:
+
+```terraform
+terraform {
+  # ... existing configuration ...
+
+  backend "s3" {
+    bucket = "your-terraform-state-bucket"
+    key    = "aws-deploy-fluss/terraform.tfstate"
+    region = "us-west-2"
+    
+    # Optional: enable state locking (DynamoDB table) and server-side encryption
+    # dynamodb_table = "terraform-state-lock"
+    # encrypt        = true
+  }
+}
+```
+
+### Step 3: Migrate Existing State
+
+If you have existing local state:
+
+```bash
+cd terraform
+terraform init -migrate-state
+```
+
+This will:
+1. Detect existing local state
+2. Prompt to migrate to S3 backend
+3. Copy state to S3
+4. Update local backend configuration
+
+## Backup Files
+
+The script creates backups with timestamps:
+- Format: `terraform.tfstate.backup.YYYYMMDD_HHMMSS`
+- Example: `terraform.tfstate.backup.20240115_143022`
+
+### Manual Backup
+
+You can also manually backup state:
+
+```bash
+cd terraform
+cp terraform.tfstate terraform.tfstate.backup.$(date +%Y%m%d_%H%M%S)
+```
+
+### Restore from Backup
+
+If state file is lost:
+
+```bash
+cd terraform
+cp terraform.tfstate.backup.YYYYMMDD_HHMMSS terraform.tfstate
+terraform init
+```
+
+## Best Practices
+
+1. **Use Remote Backend**: Configure S3 backend for production deployments
+2. **Enable Versioning**: Enable S3 bucket versioning for state files
+3. **Enable Encryption**: Use S3 server-side encryption
+4. **State Locking**: Use DynamoDB for state locking (prevents concurrent modifications)
+5. **Regular Backups**: Even with remote backend, keep periodic backups
+6. **Never Commit State**: Add `terraform.tfstate*` to `.gitignore`
+
+## Example .gitignore
+
+```gitignore
+# Terraform state files
+terraform.tfstate
+terraform.tfstate.*
+*.tfstate
+*.tfstate.backup
+
+# Terraform directories
+.terraform/
+# (do NOT ignore .terraform.lock.hcl - commit it so provider versions stay pinned)
+
+# Terraform plan files
+*.tfplan
+tfplan*
+```
+
+## Troubleshooting
+
+### State File Not Found
+
+If you see "state file not found" errors:
+
+1. Check if backups exist:
+   ```bash
+   ls -la terraform/terraform.tfstate.backup.*
+   ```
+
+2. Restore from backup (see above)
+
+3. If no backup exists, you may need to re-import resources
+
+### Backend Migration Issues
+
+If migration fails:
+
+1. Ensure S3 bucket exists and is accessible
+2. Check IAM permissions for S3 access
+3. Verify backend configuration in `main.tf`
+4. Try manual migration:
+   ```bash
+   terraform init -backend-config="bucket=your-bucket" -migrate-state
+   ```
+
+### Concurrent State Access
+
+If multiple users are running terraform:
+
+1. Configure DynamoDB table for state locking
+2. Use remote backend (S3 + DynamoDB)
+3. Coordinate deployments to avoid conflicts
+
+## Script Output Example
+
+When running the script with local state:
+
+```
+==========================================
+Checking Terraform State Configuration
+==========================================
+⚠ WARNING: No remote backend configured in main.tf
+  Terraform state will be stored locally in: .../terraform/terraform.tfstate
+  This state file is critical - if lost, Terraform cannot manage existing infrastructure
+
+  To preserve state, consider:
+    1. Configure S3 backend in main.tf (uncomment backend block)
+    2. Or backup terraform.tfstate file regularly
+
+  Creating backup of existing state file...
+  ✓ State backed up to: .../terraform.tfstate.backup.20240115_143022
+```
+
+After successful apply:
+
+```
+Backing up Terraform state after successful apply...
+✓ State backed up to: .../terraform.tfstate.backup.20240115_143045
+```
+
+At the end:
+
+```
+==========================================
+⚠ IMPORTANT: Terraform State Preservation
+==========================================
+
+Your Terraform state is stored locally at:
+  .../terraform/terraform.tfstate
+
+⚠ CRITICAL: This file is essential for managing your infrastructure!
+  - DO NOT delete this file
+  - DO NOT commit it to version control (contains sensitive data)
+  - Backup this file regularly
+
+To configure remote state storage (recommended):
+  1. Create an S3 bucket for Terraform state
+  2. Uncomment and configure the backend block in main.tf
+  3. Run: terraform init -migrate-state
+```
+
+## Summary
+
+The updated script now:
+- ✅ Detects backend configuration
+- ✅ Backs up state before operations
+- ✅ Backs up state after successful apply
+- ✅ Provides clear warnings and instructions
+- ✅ Verifies backend status after init
+- ✅ Reminds users about state preservation
+
+This ensures Terraform state is preserved and users are aware of the importance of state management.
+
diff --git a/e2e-iot/high-infra/k8s/scripts/deploy-benchmark.sh b/e2e-iot/high-infra/k8s/scripts/deploy-benchmark.sh
new file mode 100755
index 0000000..2f2971c
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/scripts/deploy-benchmark.sh
@@ -0,0 +1,327 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail  # stop on command failure, unset variables, or a failing pipeline stage
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"  # absolute directory containing this script
+K8S_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"  # parent of SCRIPT_DIR; NOTE(review): not referenced again or exported in this script
+
+# Configuration: each value may be overridden from the environment; otherwise the default shown applies
+NAMESPACE="${NAMESPACE:-fluss}"
+DEMO_IMAGE_REPO="${DEMO_IMAGE_REPO:-}"  # no default; stays empty unless the caller provides it
+DEMO_IMAGE_TAG="${DEMO_IMAGE_TAG:-latest}"
+FLUSS_IMAGE_REPO="${FLUSS_IMAGE_REPO:-apache/fluss:0.8.0-incubating}"
+CLUSTER_NAME="${CLUSTER_NAME:-fluss-eks-cluster}"
+REGION="${REGION:-us-west-2}"
+
+# Export the resolved values so the step scripts started by run_step inherit them
+export NAMESPACE DEMO_IMAGE_REPO DEMO_IMAGE_TAG FLUSS_IMAGE_REPO CLUSTER_NAME REGION
+
+# Step-number -> script-name lookup, kept as a case statement so it runs on bash 3.2 (no associative arrays).
+get_step_script() {
+    case "${1}" in  # unknown step numbers print an empty line
+        0) printf '%s\n' "00-deploy-infra.sh" ;;
+        1) printf '%s\n' "01-update-kubeconfig.sh" ;;
+        2) printf '%s\n' "02-setup-storage.sh" ;;
+        3) printf '%s\n' "03-deploy-components.sh" ;;
+        4) printf '%s\n' "04-verify-storage.sh" ;;
+        5) printf '%s\n' "05-deploy-producer.sh" ;;
+        6) printf '%s\n' "06-submit-flink-job.sh" ;;
+        7) printf '%s\n' "07-deploy-dashboard.sh" ;;
+        8) printf '%s\n' "08-verify-deployment.sh" ;;
+        *) printf '\n' ;;
+    esac
+}
+
+get_step_description() {  # prints the human-readable title of step number $1
+    case "${1}" in  # unknown step numbers print an empty line
+        0) printf '%s\n' "Deploy infrastructure with Terraform" ;;
+        1) printf '%s\n' "Update kubeconfig" ;;
+        2) printf '%s\n' "Setup local NVMe storage" ;;
+        3) printf '%s\n' "Deploy all components" ;;
+        4) printf '%s\n' "Verify NVMe storage" ;;
+        5) printf '%s\n' "Deploy multi-instance producer" ;;
+        6) printf '%s\n' "Submit Flink aggregator job" ;;
+        7) printf '%s\n' "Deploy Grafana dashboard" ;;
+        8) printf '%s\n' "Verify deployment" ;;
+        *) printf '\n' ;;
+    esac
+}
+
+# Space-separated step numbers in execution order; main iterates this list to build its run plan
+ALL_STEPS="0 1 2 3 4 5 6 7 8"
+
+# Function to run a step
+run_step() {
+    local step_num=$1
+    local script_name=$2
+    local step_desc=$3
+    
+    local script_path="${SCRIPT_DIR}/${script_name}"
+    
+    if [ ! -f "${script_path}" ]; then
+        echo "ERROR: Script not found: ${script_path}"
+        return 1
+    fi
+    
+    # Make script executable
+    chmod +x "${script_path}"
+    
+    echo ""
+    echo "=========================================="
+    echo "Running Step ${step_num}: ${step_desc}"
+    echo "=========================================="
+    echo ""
+    
+    # Run the script and capture exit code
+    if "${script_path}"; then
+        echo ""
+        echo "✓ Step ${step_num} completed successfully: ${step_desc}"
+        return 0
+    else
+        local exit_code=$?
+        echo ""
+        echo "✗ Step ${step_num} FAILED: ${step_desc}"
+        echo "  Exit code: ${exit_code}"
+        return ${exit_code}
+    fi
+}
+
+# Print the help text (options, environment variables, steps, examples) to stdout; callers choose the exit code
+usage() {
+    cat << EOF
+Usage: $0 [OPTIONS]
+
+Deploy the 2-million-messages-per-second benchmark infrastructure and components.
+
+OPTIONS:
+    --skip-step N          Skip step N (can be used multiple times)
+    --start-from-step N    Start from step N (skip previous steps)
+    --only-step N          Run only step N
+    --help                 Show this help message
+
+ENVIRONMENT VARIABLES:
+    NAMESPACE              Kubernetes namespace (default: fluss)
+    DEMO_IMAGE_REPO        Demo image repository (required for step 5)
+    DEMO_IMAGE_TAG         Demo image tag (default: latest)
+    FLUSS_IMAGE_REPO       Fluss image repository (default: apache/fluss:0.8.0-incubating)
+    CLUSTER_NAME           EKS cluster name (default: fluss-eks-cluster)
+    REGION                 AWS region (default: us-west-2)
+
+STEPS:
+    0. Deploy infrastructure with Terraform
+    1. Update kubeconfig
+    2. Setup local NVMe storage
+    3. Deploy all components (ZooKeeper, Fluss, Flink, Monitoring)
+    4. Verify NVMe storage
+    5. Deploy multi-instance producer
+    6. Submit Flink aggregator job
+    7. Deploy Grafana dashboard
+    8. Verify deployment
+
+EXAMPLE:
+    # Run all steps
+    ./deploy-benchmark.sh
+
+    # Skip step 4 (storage verification)
+    ./deploy-benchmark.sh --skip-step 4
+
+    # Start from step 5 (producer deployment)
+    ./deploy-benchmark.sh --start-from-step 5
+
+    # Run only step 3
+    ./deploy-benchmark.sh --only-step 3
+
+    # With custom image
+    DEMO_IMAGE_REPO=my-repo/fluss-demo ./deploy-benchmark.sh
+EOF
+}
+
+# Parse command line arguments.
+# Every step-taking option must be followed by a non-negative integer; a missing or
+# non-numeric value prints usage and exits 1 instead of dying on an unbound "$2".
+SKIP_STEPS=()
+START_FROM_STEP=""
+ONLY_STEP=""
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --skip-step|--start-from-step|--only-step)
+            if [[ $# -lt 2 || ! "$2" =~ ^[0-9]+$ ]]; then
+                echo "ERROR: Option $1 requires a numeric step argument"
+                usage
+                exit 1
+            fi
+            case "$1" in
+                --skip-step)       SKIP_STEPS+=("$2") ;;
+                --start-from-step) START_FROM_STEP="$2" ;;
+                --only-step)       ONLY_STEP="$2" ;;
+            esac
+            shift 2
+            ;;
+        --help) usage; exit 0 ;;
+        *)
+            echo "ERROR: Unknown option: $1"
+            usage
+            exit 1
+            ;;
+    esac
+done
+
+# Validate required environment variables for specific steps
+if [ -z "${DEMO_IMAGE_REPO}" ] && [ -z "${ONLY_STEP}" ]; then
+    echo "WARNING: DEMO_IMAGE_REPO is not set. Step 5 (producer deployment) will fail."
+    echo "Set it with: export DEMO_IMAGE_REPO=your-repo/fluss-demo"
+    echo ""
+fi
+
+# Main execution: resolve the step list from the CLI flags, run each step in order,
+# and stop at the first failing step with instructions for retrying it.
+main() {
+    echo "=========================================="
+    echo "2 Million Messages Per Second Benchmark"
+    echo "Deployment Master Script"
+    echo "=========================================="
+    echo ""
+    echo "Configuration:"
+    echo "  Namespace: ${NAMESPACE}"
+    echo "  Demo Image: ${DEMO_IMAGE_REPO}:${DEMO_IMAGE_TAG}"
+    echo "  Fluss Image: ${FLUSS_IMAGE_REPO}"
+    echo "  Cluster: ${CLUSTER_NAME}"
+    echo "  Region: ${REGION}"
+    echo ""
+
+    # Determine which steps to run
+    local steps_to_run=""
+    local step skip skip_step script_name step_desc
+
+    if [ -n "${ONLY_STEP}" ]; then
+        # Run only the specified step (quoted so a value like "1 2" is validated as a whole)
+        if [ -z "$(get_step_script "${ONLY_STEP}")" ]; then
+            echo "ERROR: Invalid step number: ${ONLY_STEP}"
+            echo "Valid steps: ${ALL_STEPS}"
+            exit 1
+        fi
+        steps_to_run="${ONLY_STEP}"
+    elif [ -n "${START_FROM_STEP}" ]; then
+        # Start from the specified step; reject non-numeric values instead of letting '-ge' error out
+        if ! [[ "${START_FROM_STEP}" =~ ^[0-9]+$ ]]; then
+            echo "ERROR: Invalid step number: ${START_FROM_STEP}"
+            echo "Valid steps: ${ALL_STEPS}"
+            exit 1
+        fi
+        # ALL_STEPS is already in ascending order, so the selection needs no sorting
+        for step in ${ALL_STEPS}; do
+            if [ "${step}" -ge "${START_FROM_STEP}" ]; then
+                steps_to_run="${steps_to_run} ${step}"
+            fi
+        done
+    else
+        steps_to_run="${ALL_STEPS}"
+    fi
+
+    # Filter out skipped steps
+    local filtered_steps=""
+    for step in ${steps_to_run}; do
+        skip=false
+        # Length guard: expanding an empty array under 'set -u' is an error on bash older than 4.4
+        if [ ${#SKIP_STEPS[@]} -gt 0 ]; then
+            for skip_step in "${SKIP_STEPS[@]}"; do
+                if [ "${step}" = "${skip_step}" ]; then
+                    skip=true
+                    break
+                fi
+            done
+        fi
+        if [ "${skip}" = false ]; then
+            filtered_steps="${filtered_steps} ${step}"
+        fi
+    done
+
+    # Drop the leading separator left by the append loop
+    filtered_steps="${filtered_steps# }"
+    if [ -z "${filtered_steps}" ]; then
+        echo "ERROR: No steps to run after filtering"
+        exit 1
+    fi
+
+    echo "Steps to execute: ${filtered_steps}"
+    echo ""
+
+    # Run each step, stopping at the first failure
+    for step in ${filtered_steps}; do
+        script_name=$(get_step_script "${step}")
+        step_desc=$(get_step_description "${step}")
+        if ! run_step "${step}" "${script_name}" "${step_desc}"; then
+            echo ""
+            echo "=========================================="
+            echo "DEPLOYMENT FAILED"
+            echo "=========================================="
+            echo ""
+            echo "Failed at Step ${step}: ${step_desc}"
+            echo ""
+            # Printed as a single line; looping over the text would split the description into words
+            echo "Failed steps:"
+            echo "  - Step ${step}: ${step_desc}"
+            echo ""
+            echo "To retry from this step:"
+            echo "  ./deploy-benchmark.sh --start-from-step ${step}"
+            echo ""
+            echo "To retry only this step:"
+            echo "  ./deploy-benchmark.sh --only-step ${step}"
+            echo ""
+            exit 1
+        fi
+    done
+
+    # Success summary
+    echo ""
+    echo "=========================================="
+    echo "DEPLOYMENT COMPLETED SUCCESSFULLY"
+    echo "=========================================="
+    echo ""
+    echo "All steps completed:"
+    for step in ${filtered_steps}; do
+        step_desc=$(get_step_description "${step}")
+        echo "  ✓ Step ${step}: ${step_desc}"
+    done
+    echo ""
+    echo "Access services:"
+    echo "  Flink Web UI:"
+    echo "    kubectl port-forward -n ${NAMESPACE} svc/flink-jobmanager 8081:8081"
+    echo "    Then open: http://localhost:8081"
+    echo ""
+    echo "  Grafana:"
+    echo "    kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80"
+    echo "    Then open: http://localhost:3000"
+    echo "    Username: admin"
+    echo "    Password: admin123"
+    echo ""
+    echo "  Prometheus:"
+    echo "    kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090"
+    echo "    Then open: http://localhost:9090"
+    echo ""
+    echo "Check status:"
+    echo "  kubectl get pods -n ${NAMESPACE}"
+    echo "  kubectl get pods -n monitoring"
+    echo ""
+}
+
+# Run main function
+main
+
diff --git a/e2e-iot/high-infra/k8s/scripts/test-scripts.sh b/e2e-iot/high-infra/k8s/scripts/test-scripts.sh
new file mode 100755
index 0000000..3355566
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/scripts/test-scripts.sh
@@ -0,0 +1,184 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+K8S_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+echo "=========================================="
+echo "Script Validation Test"
+echo "=========================================="
+echo ""
+
+ERRORS=0
+WARNINGS=0
+
+# Test 1: Check all scripts exist
+echo "Test 1: Checking all scripts exist..."
+for script in 00-deploy-infra.sh 01-update-kubeconfig.sh 02-setup-storage.sh 03-deploy-components.sh 04-verify-storage.sh 05-deploy-producer.sh 06-submit-flink-job.sh 07-deploy-dashboard.sh 08-verify-deployment.sh deploy-benchmark.sh; do
+    if [ -f "${SCRIPT_DIR}/${script}" ]; then
+        echo "  ✓ ${script}"
+    else
+        echo "  ✗ ${script} MISSING"
+        ERRORS=$((ERRORS + 1))
+    fi
+done
+echo ""
+
+# Test 2: Check syntax of all scripts
+echo "Test 2: Checking script syntax..."
+for script in 00-deploy-infra.sh 01-update-kubeconfig.sh 02-setup-storage.sh 03-deploy-components.sh 04-verify-storage.sh 05-deploy-producer.sh 06-submit-flink-job.sh 07-deploy-dashboard.sh 08-verify-deployment.sh deploy-benchmark.sh; do
+    if bash -n "${SCRIPT_DIR}/${script}" 2>&1; then
+        echo "  ✓ ${script}"
+    else
+        echo "  ✗ ${script} has syntax errors"
+        ERRORS=$((ERRORS + 1))
+    fi
+done
+echo ""
+
+# Test 3: Check scripts are executable
+echo "Test 3: Checking scripts are executable..."
+for script in 00-deploy-infra.sh 01-update-kubeconfig.sh 02-setup-storage.sh 03-deploy-components.sh 04-verify-storage.sh 05-deploy-producer.sh 06-submit-flink-job.sh 07-deploy-dashboard.sh 08-verify-deployment.sh deploy-benchmark.sh; do
+    if [ -x "${SCRIPT_DIR}/${script}" ]; then
+        echo "  ✓ ${script}"
+    else
+        echo "  ⚠ ${script} is not executable (fixing...)"
+        chmod +x "${SCRIPT_DIR}/${script}"
+        WARNINGS=$((WARNINGS + 1))
+    fi
+done
+echo ""
+
+# Test 4: Check master script help works
+echo "Test 4: Testing master script help..."
+if bash "${SCRIPT_DIR}/deploy-benchmark.sh" --help 2>&1 | grep -q "Usage:"; then
+    echo "  ✓ deploy-benchmark.sh --help works"
+else
+    echo "  ✗ deploy-benchmark.sh --help failed"
+    ERRORS=$((ERRORS + 1))
+fi
+echo ""
+
+# Test 5: Check file references in scripts
+echo "Test 5: Checking file references..."
+echo "  Checking 02-setup-storage.sh references..."
+if grep -q "setup-local-storage.sh" "${SCRIPT_DIR}/02-setup-storage.sh"; then
+    if [ -f "${K8S_DIR}/storage/setup-local-storage.sh" ]; then
+        echo "  ✓ storage/setup-local-storage.sh exists"
+    else
+        echo "  ⚠ storage/setup-local-storage.sh not found (may be OK if not using storage)"
+        WARNINGS=$((WARNINGS + 1))
+    fi
+fi
+
+echo "  Checking 03-deploy-components.sh references..."
+if grep -q "deploy.sh" "${SCRIPT_DIR}/03-deploy-components.sh"; then
+    if [ -f "${K8S_DIR}/deploy.sh" ]; then
+        echo "  ✓ deploy.sh exists"
+    else
+        echo "  ✗ deploy.sh not found"
+        ERRORS=$((ERRORS + 1))
+    fi
+fi
+
+echo "  Checking 05-deploy-producer.sh references..."
+if grep -q "create-table.sh" "${SCRIPT_DIR}/05-deploy-producer.sh"; then
+    if [ -f "${K8S_DIR}/jobs/create-table.sh" ]; then
+        echo "  ✓ jobs/create-table.sh exists"
+    else
+        echo "  ✗ jobs/create-table.sh not found"
+        ERRORS=$((ERRORS + 1))
+    fi
+fi
+
+if grep -q "deploy-producer-multi-instance.sh" "${SCRIPT_DIR}/05-deploy-producer.sh"; then
+    if [ -f "${K8S_DIR}/jobs/deploy-producer-multi-instance.sh" ]; then
+        echo "  ✓ jobs/deploy-producer-multi-instance.sh exists"
+    else
+        echo "  ✗ jobs/deploy-producer-multi-instance.sh not found"
+        ERRORS=$((ERRORS + 1))
+    fi
+fi
+
+echo "  Checking 06-submit-flink-job.sh references..."
+if grep -q "submit-job-from-image.sh" "${SCRIPT_DIR}/06-submit-flink-job.sh"; then
+    if [ -f "${K8S_DIR}/flink/submit-job-from-image.sh" ]; then
+        echo "  ✓ flink/submit-job-from-image.sh exists"
+    else
+        echo "  ✗ flink/submit-job-from-image.sh not found"
+        ERRORS=$((ERRORS + 1))
+    fi
+fi
+
+echo "  Checking 07-deploy-dashboard.sh references..."
+if grep -q "deploy-dashboard.sh\|grafana-dashboard.yaml" "${SCRIPT_DIR}/07-deploy-dashboard.sh"; then
+    if [ -f "${K8S_DIR}/monitoring/deploy-dashboard.sh" ] || [ -f "${K8S_DIR}/monitoring/grafana-dashboard.yaml" ]; then
+        echo "  ✓ monitoring dashboard files exist"
+    else
+        echo "  ⚠ monitoring dashboard files not found (may be OK)"
+        WARNINGS=$((WARNINGS + 1))
+    fi
+fi
+echo ""
+
+# Test 6: Check master script step validation
+echo "Test 6: Testing master script step validation..."
+if bash "${SCRIPT_DIR}/deploy-benchmark.sh" --only-step 99 2>&1 | grep -q "Invalid step number\|ERROR"; then
+    echo "  ✓ Invalid step number validation works"
+else
+    echo "  ⚠ Step validation may not be working correctly"
+    WARNINGS=$((WARNINGS + 1))
+fi
+echo ""
+
+# Test 7: Check environment variable handling
+echo "Test 7: Testing environment variable defaults..."
+export NAMESPACE=""
+export DEMO_IMAGE_REPO=""
+export CLUSTER_NAME=""
+export REGION=""
+if bash -c 'source "${SCRIPT_DIR}/deploy-benchmark.sh" 2>/dev/null; echo "NAMESPACE=${NAMESPACE:-fluss} CLUSTER=${CLUSTER_NAME:-fluss-eks-cluster}"' 2>&1 | grep -q "fluss"; then
+    echo "  ✓ Environment variable defaults work"
+else
+    echo "  ⚠ Environment variable defaults may not work correctly"
+    WARNINGS=$((WARNINGS + 1))
+fi
+echo ""
+
+# Summary
+echo "=========================================="
+echo "Validation Summary"
+echo "=========================================="
+echo "Errors: ${ERRORS}"
+echo "Warnings: ${WARNINGS}"
+echo ""
+
+if [ ${ERRORS} -eq 0 ]; then
+    echo "✓ All critical tests passed!"
+    if [ ${WARNINGS} -gt 0 ]; then
+        echo "⚠ Some warnings found (see above)"
+    fi
+    exit 0
+else
+    echo "✗ Some tests failed. Please fix the errors above."
+    exit 1
+fi
+
+
diff --git a/e2e-iot/high-infra/k8s/storage/README.md b/e2e-iot/high-infra/k8s/storage/README.md
new file mode 100644
index 0000000..315310f
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/storage/README.md
@@ -0,0 +1,174 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+
+# Local NVMe Storage Setup for Fluss Tablet Servers
+
+This directory contains the configuration and scripts needed to set up local NVMe storage for Fluss tablet servers, similar to how Pulsar handles local storage.
+
+## Overview
+
+The setup uses:
+- **StorageClass**: `local-storage` with `no-provisioner` (requires manual PV creation)
+- **PersistentVolumes**: Manually created PVs that reference local paths on tablet server nodes
+- **Node Setup**: NVMe drives are automatically formatted and mounted by Terraform's `pre_bootstrap_user_data` script
+
+## Prerequisites
+
+1. **Terraform Infrastructure**: The tablet server nodes must be deployed with NVMe drives mounted at:
+   - `/mnt/fluss-tablet-data/fluss/data` (primary data storage)
+
+2. **Node Labels**: Tablet server nodes must have:
+   - `fluss-component: tablet-server`
+   - `storage-type: nvme`
+
+## Setup Steps
+
+### 1. Create StorageClass and PersistentVolumes
+
+Run the setup script:
+
+```bash
+cd /path/to/aws-deploy-fluss/high-infra/k8s/storage
+export NAMESPACE=fluss
+export TABLET_REPLICAS=3  # Match your tablet server replica count
+export STORAGE_SIZE=500Gi  # Adjust based on your NVMe drive size
+
+./setup-local-storage.sh
+```
+
+This script will:
+- Create the `local-storage` StorageClass
+- Create PersistentVolumes for each tablet server replica
+- Clean up any existing PVs before creating new ones
+
+### 2. Verify Setup
+
+```bash
+# Check StorageClass
+kubectl get storageclass local-storage
+
+# Check PersistentVolumes
+kubectl get pv -l component=tablet-server
+
+# Verify PVs are in Available state
+kubectl get pv -l component=tablet-server -o wide
+```
+
+### 3. Deploy Fluss with Local Storage
+
+Update your Helm values (`helm-charts/fluss-values.yaml`) or deployment script to use:
+
+```yaml
+persistence:
+  enabled: true
+  storageClass: local-storage
+  size: 500Gi
+  local_storage: true
+```
+
+Or set environment variables when deploying:
+
+```bash
+export enable_persistence=true
+export storage_class=local-storage
+export storage_size=500Gi
+export local_storage=true
+```
+
+### 4. Deploy Fluss
+
+Deploy Fluss using your normal deployment process. The StatefulSet will create PVCs that bind to the manually created PVs.
+
+## How It Works
+
+1. **Terraform Setup**: When tablet server nodes are created, the `pre_bootstrap_user_data` script:
+   - Formats NVMe drives (`/dev/nvme1n1`, `/dev/nvme2n1`)
+   - Mounts them to `/mnt/fluss-tablet-data` and `/mnt/fluss-tablet-logs`
+   - Creates directory structure with proper permissions
+
+2. **Kubernetes Storage**: 
+   - StorageClass `local-storage` uses `no-provisioner` (manual PV creation)
+   - PersistentVolumes are created manually by `create-local-pvs.sh` (one per tablet server replica), each referencing a local path on a tablet server node
+   - PVs have node affinity to ensure they only bind to tablet server nodes
+
+3. **Pod Binding**:
+   - When a Fluss tablet server pod is created, it requests a PVC
+   - The PVC binds to an available PV based on node affinity
+   - The pod mounts the local NVMe storage at `/tmp/fluss/data`
+
+## Files
+
+- `local-storage-class.yaml`: StorageClass definition
+- `create-local-pvs.sh`: Script to generate PersistentVolumes
+- `setup-local-storage.sh`: Main setup script (runs everything)
+
+## Troubleshooting
+
+### PVs Not Binding
+
+If PVCs remain in `Pending` state:
+
+1. Check PV node affinity matches your nodes:
+   ```bash
+   kubectl get pv fluss-tablet-data-0 -o yaml | grep -A 10 nodeAffinity
+   ```
+
+2. Verify node labels:
+   ```bash
+   kubectl get nodes -l fluss-component=tablet-server --show-labels
+   ```
+
+3. Check if PVs are in `Available` state:
+   ```bash
+   kubectl get pv -l component=tablet-server
+   ```
+
+### Storage Not Available
+
+If pods can't access storage:
+
+1. Verify NVMe drives are mounted on nodes:
+   ```bash
+   kubectl debug node/<tablet-server-node> -it --image=busybox -- df -h /mnt/fluss-tablet-data
+   ```
+
+2. Check directory permissions:
+   ```bash
+   kubectl debug node/<tablet-server-node> -it --image=busybox -- ls -la /mnt/fluss-tablet-data
+   ```
+
+### Recreating PVs
+
+To recreate PVs (e.g., after changing replica count):
+
+```bash
+# Delete existing PVs
+kubectl delete pv -l component=tablet-server,type=local-nvme
+
+# Recreate with new settings
+export TABLET_REPLICAS=5
+./setup-local-storage.sh
+```
+
+## Notes
+
+- **Reclaim Policy**: PVs use `Retain` policy to prevent accidental data loss
+- **Volume Binding**: Uses `WaitForFirstConsumer` to ensure pods are scheduled on correct nodes
+- **Storage Size**: Adjust `STORAGE_SIZE` based on your actual NVMe drive capacity
+- **Replica Count**: Ensure `TABLET_REPLICAS` matches your Fluss tablet server replica count
+
diff --git a/e2e-iot/high-infra/k8s/storage/create-local-pvs.sh b/e2e-iot/high-infra/k8s/storage/create-local-pvs.sh
new file mode 100755
index 0000000..c83d7ed
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/storage/create-local-pvs.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Creates TABLET_REPLICAS PersistentVolumes (fluss-tablet-data-<i>) for Fluss tablet servers on local NVMe storage.
+# The shebang must be the first line to take effect; the loop is arithmetic because 'seq 0 -1' counts down on BSD/macOS.
+
+set -e
+
+NAMESPACE="${NAMESPACE:-fluss}"
+TABLET_REPLICAS="${TABLET_REPLICAS:-3}"
+STORAGE_SIZE="${STORAGE_SIZE:-500Gi}"
+
+echo "Creating PersistentVolumes for Fluss tablet servers..."
+echo "  Namespace: ${NAMESPACE}"
+echo "  Replicas: ${TABLET_REPLICAS}"
+echo "  Storage Size: ${STORAGE_SIZE}"
+
+for ((i = 0; i < TABLET_REPLICAS; i++)); do
+  cat <<EOF | kubectl apply -f -
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: fluss-tablet-data-${i}
+  labels:
+    type: local-nvme
+    component: tablet-server
+    storage-type: data
+    app: fluss
+spec:
+  capacity:
+    storage: ${STORAGE_SIZE}
+  accessModes:
+    - ReadWriteOnce
+  storageClassName: local-storage
+  local:
+    path: /opt/alldata/fluss
+  nodeAffinity:
+    required:
+      nodeSelectorTerms:
+        - matchExpressions:
+            - key: fluss-component
+              operator: In
+              values:
+                - tablet-server
+            - key: storage-type
+              operator: In
+              values:
+                - nvme
+  persistentVolumeReclaimPolicy: Retain
+EOF
+  echo "  ✓ Created PV: fluss-tablet-data-${i}"
+done
+
+echo ""
+echo "PersistentVolumes created successfully!"
+echo "Verify with: kubectl get pv -l component=tablet-server"
+
diff --git a/e2e-iot/high-infra/k8s/storage/local-storage-class.yaml b/e2e-iot/high-infra/k8s/storage/local-storage-class.yaml
new file mode 100644
index 0000000..d15b2e6
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/storage/local-storage-class.yaml
@@ -0,0 +1,30 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# StorageClass for local NVMe storage.
+# Uses the no-provisioner, so PersistentVolumes must be created manually (see create-local-pvs.sh).
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: local-storage
+  annotations:
+    storageclass.kubernetes.io/is-default-class: "false"  # not marked as the cluster default class
+provisioner: kubernetes.io/no-provisioner
+volumeBindingMode: WaitForFirstConsumer  # binding is delayed until a pod using the claim is scheduled
+reclaimPolicy: Delete  # NOTE(review): the PVs from create-local-pvs.sh set Retain themselves; confirm this class-level value is intended
+allowVolumeExpansion: false
+
diff --git a/e2e-iot/high-infra/k8s/storage/setup-local-storage.sh b/e2e-iot/high-infra/k8s/storage/setup-local-storage.sh
new file mode 100755
index 0000000..955f671
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/storage/setup-local-storage.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Setup script for local NVMe storage: applies the StorageClass, removes old PVs, then runs create-local-pvs.sh.
+# The shebang must be the first line to take effect; this script needs bash because it reads BASH_SOURCE.
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+NAMESPACE="${NAMESPACE:-fluss}"
+TABLET_REPLICAS="${TABLET_REPLICAS:-3}"
+STORAGE_SIZE="${STORAGE_SIZE:-500Gi}"
+
+echo "=========================================="
+echo "Setting up Local NVMe Storage for Fluss"
+echo "=========================================="
+echo ""
+
+# Step 1: Create StorageClass
+echo "[1/3] Creating StorageClass for local storage..."
+kubectl apply -f "${SCRIPT_DIR}/local-storage-class.yaml"
+echo "  ✓ StorageClass 'local-storage' created"
+echo ""
+
+# Step 2: Delete existing PVs if they exist (for idempotency)
+echo "[2/3] Cleaning up existing PersistentVolumes (if any)..."
+kubectl delete pv -l component=tablet-server,type=local-nvme --ignore-not-found=true
+echo "  ✓ Cleanup completed"
+echo ""
+
+# Step 3: Create PersistentVolumes (settings are exported so the child script sees them)
+echo "[3/3] Creating PersistentVolumes for ${TABLET_REPLICAS} tablet server(s)..."
+export NAMESPACE TABLET_REPLICAS STORAGE_SIZE
+"${SCRIPT_DIR}/create-local-pvs.sh"
+echo ""
+
+echo "=========================================="
+echo "Local Storage Setup Complete!"
+echo "=========================================="
+echo ""
+echo "Next steps:"
+echo "  1. Verify StorageClass: kubectl get storageclass local-storage"
+echo "  2. Verify PVs: kubectl get pv -l component=tablet-server"
+echo "  3. Deploy Fluss with:"
+echo "     - persistence.enabled: true"
+echo "     - persistence.storageClass: local-storage"
+echo "     - persistence.size: ${STORAGE_SIZE}"
+echo ""
+
diff --git a/e2e-iot/high-infra/k8s/zookeeper/zookeeper.yaml b/e2e-iot/high-infra/k8s/zookeeper/zookeeper.yaml
new file mode 100644
index 0000000..6439452
--- /dev/null
+++ b/e2e-iot/high-infra/k8s/zookeeper/zookeeper.yaml
@@ -0,0 +1,75 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: zk-svc  # referenced as serviceName by the zk StatefulSet in this file
+  namespace: fluss
+  labels:
+    app: zookeeper
+spec:
+  selector:
+    app: zookeeper  # matches the pod label set in the StatefulSet template
+  ports:
+    - name: client
+      port: 2181
+      targetPort: 2181
+  clusterIP: None  # Headless service
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: zk
+  namespace: fluss  # NOTE(review): fixed here while the deploy scripts allow overriding NAMESPACE; confirm they stay in sync
+  labels:
+    app: zookeeper
+spec:
+  serviceName: zk-svc  # the headless Service defined above
+  replicas: 1  # single ZooKeeper instance
+  selector:
+    matchLabels:
+      app: zookeeper
+  template:
+    metadata:
+      labels:
+        app: zookeeper
+    spec:  # NOTE(review): no volumes or volumeClaimTemplates are declared; ZooKeeper data presumably does not survive pod restarts
+      containers:
+        - name: zookeeper
+          image: zookeeper:3.9.2
+          ports:
+            - name: client
+              containerPort: 2181
+          resources:
+            requests:
+              cpu: "200m"
+              memory: "256Mi"
+            limits:
+              cpu: "500m"
+              memory: "512Mi"
+          livenessProbe:  # TCP connect on the client port, first check after 30s, then every 10s
+            tcpSocket:
+              port: 2181
+            initialDelaySeconds: 30
+            periodSeconds: 10
+          readinessProbe:  # TCP connect on the client port, first check after 10s, then every 5s
+            tcpSocket:
+              port: 2181
+            initialDelaySeconds: 10
+            periodSeconds: 5
+
diff --git a/e2e-iot/high-infra/push-images-to-ecr.sh b/e2e-iot/high-infra/push-images-to-ecr.sh
new file mode 100755
index 0000000..6d61b17
--- /dev/null
+++ b/e2e-iot/high-infra/push-images-to-ecr.sh
@@ -0,0 +1,252 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail  # Abort on command failure, on unset variables, and on failed pipeline stages
+
+# Script to build and push images to ECR:
+# 1. fluss-demo (for producer and flink aggregator)
+# 2. fluss (Apache Fluss image)
+#
+# Usage:
+#   ./high-infra/push-images-to-ecr.sh --all              # Push both images
+#   ./high-infra/push-images-to-ecr.sh --producer-only    # Push only producer image
+#   ./high-infra/push-images-to-ecr.sh --fluss-only       # Push only Fluss image
+#
+# IMPORTANT: This script must live in <project>/high-infra; <project> is expected to be named e2e-iot (override: EXPECTED_BASE_NAME)
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# Project root (parent of high-infra), derived from the script location rather than the caller's cwd
+BASE_DIR=$(cd "${SCRIPT_DIR}/.." && pwd)
+DEMO_DIR="${BASE_DIR}/fluss_flink_realtime"
+AWS_REGION=${AWS_REGION:-us-west-2}
+FLUSS_VERSION=${FLUSS_VERSION:-0.8.0-incubating}
+ECR_INFO_FILE="${BASE_DIR}/ecr-repositories.txt"
+
+# Colors for output
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+# Validate the project layout: the demo sources must sit next to high-infra
+if [ ! -d "${DEMO_DIR}" ]; then
+    echo -e "${RED}Error: Cannot find fluss_flink_realtime directory${NC}"
+    echo -e "${RED}Expected: ${DEMO_DIR}${NC}"
+    echo -e "${RED}Please keep this script in the high-infra directory next to fluss_flink_realtime${NC}"
+    exit 1
+fi
+
+# Validate the project directory name; defaults to e2e-iot (where this script ships), overridable via the environment
+EXPECTED_BASE_NAME=${EXPECTED_BASE_NAME:-e2e-iot}
+ACTUAL_BASE_NAME=$(basename "${BASE_DIR}")
+if [ "${ACTUAL_BASE_NAME}" != "${EXPECTED_BASE_NAME}" ]; then
+    echo -e "${RED}Error: Script must be located in the ${EXPECTED_BASE_NAME} project directory${NC}"
+    echo -e "${RED}Detected project directory: ${BASE_DIR}${NC}"
+    echo -e "${RED}Set EXPECTED_BASE_NAME=${ACTUAL_BASE_NAME} if the project directory was renamed on purpose${NC}"
+    exit 1
+fi
+
+# Translate the single mode flag into two switches that gate the build/push steps further down.
+# Both switches start out disabled; each recognised flag enables only what it asks for.
+PUSH_DEMO=false
+PUSH_FLUSS=false
+
+case "${1:-}" in
+    --all)
+        PUSH_DEMO=true
+        PUSH_FLUSS=true
+        ;;
+    --producer-only)
+        PUSH_DEMO=true
+        ;;
+    --fluss-only)
+        PUSH_FLUSS=true
+        ;;
+    *)
+        # Anything else (including no argument at all) prints the usage text and aborts
+        echo -e "${RED}Error: Missing or invalid argument${NC}"
+        echo -e "Usage:"
+        echo -e "  $0 --all            # Push both images"
+        echo -e "  $0 --producer-only  # Push only producer image"
+        echo -e "  $0 --fluss-only     # Push only Fluss image"
+        exit 1
+        ;;
+esac
+
+echo -e "${GREEN}=== Building and Pushing Images to ECR ===${NC}\n"
+# Announce the selected mode; the switch combination is matched as "<demo>:<fluss>"
+case "${PUSH_DEMO}:${PUSH_FLUSS}" in
+    true:true)  echo -e "${YELLOW}Mode: Push both producer and Fluss images${NC}\n" ;;
+    true:false) echo -e "${YELLOW}Mode: Push only producer image${NC}\n" ;;
+    false:true) echo -e "${YELLOW}Mode: Push only Fluss image${NC}\n" ;;
+esac
+
+# Get AWS account ID
+AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
+if [ -z "$AWS_ACCOUNT_ID" ]; then
+    echo -e "${RED}Error: Unable to get AWS account ID. Is AWS CLI configured?${NC}"
+    exit 1
+fi
+
+ECR_BASE="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com"
+DEMO_REPO="${ECR_BASE}/fluss-demo"
+FLUSS_REPO="${ECR_BASE}/fluss"
+
+echo -e "${YELLOW}AWS Account ID: ${AWS_ACCOUNT_ID}${NC}"
+echo -e "${YELLOW}AWS Region: ${AWS_REGION}${NC}"
+echo -e "${YELLOW}Demo Repository: ${DEMO_REPO}${NC}"
+echo -e "${YELLOW}Fluss Repository: ${FLUSS_REPO}${NC}\n"
+
+# Login to ECR
+echo -e "${YELLOW}[1/5] Logging in to ECR...${NC}"
+aws ecr get-login-password --region "${AWS_REGION}" | \
+    docker login --username AWS --password-stdin "${ECR_BASE}"
+echo -e "${GREEN}✓ Logged in to ECR${NC}\n"
+
+# Ensure ECR repositories exist (they should be created by Terraform)
+echo -e "${YELLOW}[2/5] Checking ECR repositories...${NC}"
+if ! aws ecr describe-repositories --repository-names fluss-demo --region "${AWS_REGION}" >/dev/null 2>&1; then
+    echo -e "${YELLOW}Creating fluss-demo repository...${NC}"
+    aws ecr create-repository --repository-name fluss-demo --region "${AWS_REGION}" >/dev/null
+fi
+if ! aws ecr describe-repositories --repository-names fluss --region "${AWS_REGION}" >/dev/null 2>&1; then
+    echo -e "${YELLOW}Creating fluss repository...${NC}"
+    aws ecr create-repository --repository-name fluss --region "${AWS_REGION}" >/dev/null
+fi
+echo -e "${GREEN}✓ ECR repositories ready${NC}\n"
+
+# Producer image: build the JAR with Maven, build a linux/amd64 image from it, then push :latest plus a timestamp tag
+if [ "$PUSH_DEMO" != true ]; then
+    echo -e "${YELLOW}[3/5] Skipping producer image (not requested)${NC}\n"
+else
+    echo -e "${YELLOW}[3/5] Building producer application image...${NC}"
+    echo -e "${YELLOW}Step 1: Building JAR from source (clean build)...${NC}"
+    cd "${DEMO_DIR}"
+    mvn clean package
+    JAR_FILE=$(find "${DEMO_DIR}/target" -name "fluss-flink-realtime-demo*.jar" -type f 2>/dev/null | head -1)
+    if ! { [ -n "${JAR_FILE}" ] && [ -f "${JAR_FILE}" ]; }; then
+        echo -e "${RED}Error: JAR file not found after build${NC}"
+        exit 1
+    fi
+    echo -e "${GREEN}✓ JAR built successfully: ${JAR_FILE}${NC}"
+    echo ""
+    # Still inside DEMO_DIR from the Maven step, which is also the Docker build context
+    echo -e "${YELLOW}Building Docker image for linux/amd64...${NC}"
+    docker build --platform linux/amd64 -t fluss-demo:latest .
+    TIMESTAMP=$(date +%Y%m%d-%H%M%S)
+    for IMAGE_TAG in latest "${TIMESTAMP}"; do
+        docker tag fluss-demo:latest "${DEMO_REPO}:${IMAGE_TAG}"
+    done
+    echo -e "${YELLOW}Pushing producer image to ECR...${NC}"
+    for IMAGE_TAG in latest "${TIMESTAMP}"; do
+        docker push "${DEMO_REPO}:${IMAGE_TAG}"
+    done
+    echo -e "${GREEN}✓ Producer image pushed to ${DEMO_REPO}${NC}\n"
+fi
+
+# Fluss image: mirror the upstream Docker Hub image into ECR under both the version tag and :latest
+if [ "$PUSH_FLUSS" != true ]; then
+    echo -e "${YELLOW}[4/5] Skipping Fluss image (not requested)${NC}\n"
+else
+    echo -e "${YELLOW}[4/5] Pulling Apache Fluss image from Docker Hub (linux/amd64)...${NC}"
+    FLUSS_IMAGE="apache/fluss:${FLUSS_VERSION}"
+    docker pull --platform linux/amd64 "${FLUSS_IMAGE}"
+    echo -e "${GREEN}✓ Fluss image pulled${NC}"
+    echo -e "${YELLOW}Tagging Fluss image for ECR...${NC}"
+    for FLUSS_TAG in "${FLUSS_VERSION}" latest; do
+        docker tag "${FLUSS_IMAGE}" "${FLUSS_REPO}:${FLUSS_TAG}"
+    done
+    echo -e "${YELLOW}Pushing Fluss image to ECR...${NC}"
+    for FLUSS_TAG in "${FLUSS_VERSION}" latest; do
+        docker push "${FLUSS_REPO}:${FLUSS_TAG}"
+    done
+    echo -e "${GREEN}✓ Fluss image pushed to ${FLUSS_REPO}${NC}\n"
+fi
+
+# Summary of what was pushed in this run
+echo -e "${GREEN}=== Image Push Complete ===${NC}\n"
+echo -e "Images pushed:"
+if [ "$PUSH_DEMO" = true ]; then
+    echo -e "  ${DEMO_REPO}:latest"
+fi
+if [ "$PUSH_FLUSS" = true ]; then
+    echo -e "  ${FLUSS_REPO}:${FLUSS_VERSION}"
+    echo -e "  ${FLUSS_REPO}:latest"
+fi
+echo -e ""
+
+# Save ECR repository details to a file that must remain valid shell, because users are told to source it
+echo -e "${YELLOW}[5/5] Saving ECR repository details to ${ECR_INFO_FILE}...${NC}"
+cat > "${ECR_INFO_FILE}" << EOF
+# ECR Repository Details
+# Generated on: $(date)
+# AWS Account ID: ${AWS_ACCOUNT_ID}
+# AWS Region: ${AWS_REGION}
+
+EOF
+
+if [ "$PUSH_DEMO" = true ]; then
+    cat >> "${ECR_INFO_FILE}" << EOF
+# Demo/Producer Image Repository
+DEMO_IMAGE_REPOSITORY="${DEMO_REPO}"
+DEMO_IMAGE_TAG="latest"
+
+# For terraform.tfvars (commented out so that this file can be sourced by a shell):
+#   demo_image_repository = "${DEMO_REPO}"
+
+EOF
+fi
+
+if [ "$PUSH_FLUSS" = true ]; then
+    cat >> "${ECR_INFO_FILE}" << EOF
+# Fluss Image Repository
+FLUSS_IMAGE_REPOSITORY="${FLUSS_REPO}"
+FLUSS_IMAGE_VERSION="${FLUSS_VERSION}"
+
+# For terraform.tfvars (commented out so that this file can be sourced by a shell):
+#   fluss_image_repository = "${FLUSS_REPO}"
+#   use_ecr_for_fluss = true
+
+EOF
+fi
+
+cat >> "${ECR_INFO_FILE}" << EOF
+# Full ECR Base URL
+ECR_BASE="${ECR_BASE}"
+
+# To use these values in shell scripts:
+# source ${ECR_INFO_FILE}
+# echo \${DEMO_IMAGE_REPOSITORY}
+EOF
+
+echo -e "${GREEN}✓ ECR repository details saved to ${ECR_INFO_FILE}${NC}"
+echo -e ""
+echo -e "To use these values:"
+echo -e "  source ${ECR_INFO_FILE}"
+echo -e ""
+echo -e "Or update terraform.tfvars with:"
+if [ "$PUSH_DEMO" = true ]; then
+    echo -e "  demo_image_repository = \"${DEMO_REPO}\""
+fi
+if [ "$PUSH_FLUSS" = true ]; then
+    echo -e "  fluss_image_repository = \"${FLUSS_REPO}\""
+    echo -e "  use_ecr_for_fluss = true"
+fi
+echo -e ""
+
diff --git a/e2e-iot/high-infra/terraform/TERRAFORM_PLAN_ERROR_FIX.md b/e2e-iot/high-infra/terraform/TERRAFORM_PLAN_ERROR_FIX.md
new file mode 100644
index 0000000..03a2248
--- /dev/null
+++ b/e2e-iot/high-infra/terraform/TERRAFORM_PLAN_ERROR_FIX.md
@@ -0,0 +1,106 @@
+# Terraform Plan Error Fix - Kubernetes Provider Connection
+
+## Issue
+
+When running `terraform plan` for the first time (before EKS cluster exists), you may see errors like:
+
+```
+Error: Get "http://localhost/api/v1/namespaces/fluss": dial tcp [::1]:80: connect: connection refused
+Error: Get "http://localhost/api/v1/namespaces/kube-system/configmaps/aws-auth": dial tcp [::1]:80: connect: connection refused
+```
+
+## Root Cause
+
+Terraform tries to validate the Kubernetes provider configuration during the `plan` phase. Since the EKS cluster doesn't exist yet, `module.eks.cluster_endpoint` is unknown, so the Kubernetes provider may fall back to its default host (`localhost`), which produces the connection errors shown above.
+
+## Solution
+
+The configuration has been updated to:
+
+1. **Added `depends_on` to Kubernetes resources** - Ensures resources wait for EKS cluster to be created
+2. **Added `null_resource` wait** - Waits for cluster to be fully active before creating Kubernetes resources
+
+## Workaround for First-Time Deployment
+
+### Option 1: Proceed with Apply (Recommended)
+
+Even if `terraform plan` shows Kubernetes connection errors, you can proceed with `terraform apply`. The errors are expected during plan when the cluster doesn't exist yet. During apply:
+
+1. EKS cluster will be created first
+2. `null_resource.wait_for_cluster` will wait for cluster to be ready
+3. Kubernetes resources will be created after cluster is ready
+
+**Command:**
+```bash
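+# Plan may show the connection errors above (a plan that fails does not write tfplan)
+terraform plan -out=tfplan
+
+# Apply the saved plan; if no tfplan was written, run plain `terraform apply` or use Option 2/3
+terraform apply tfplan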
+```
+
+### Option 2: Two-Phase Deployment
+
+If you want to avoid plan errors, deploy in two phases:
+
+**Phase 1: Create EKS Cluster Only**
+```bash
+# Comment out Kubernetes resources temporarily in s3_flink_checkpoints.tf
+# Then run:
+terraform apply -target=module.eks
+```
+
+**Phase 2: Create Kubernetes Resources**
+```bash
+# Uncomment Kubernetes resources
+# Then run:
+terraform apply
+```
+
+### Option 3: Use `-target` Flag
+
+Target only AWS resources first, then Kubernetes resources:
+
+```bash
+# First, create EKS cluster and AWS resources
+terraform apply -target=module.eks -target=module.vpc -target=aws_s3_bucket.flink_state
+
+# Then create Kubernetes resources
+terraform apply
+```
+
+## Fixed Configuration
+
+The following changes were made:
+
+1. **Added `null_resource.wait_for_cluster`** - Waits for EKS cluster to be active
+2. **Updated `kubernetes_namespace.fluss` dependencies** - Now depends on cluster readiness
+3. **Added proper dependency chain** - Ensures correct order of resource creation
+
+## Verification
+
+After successful deployment, verify:
+
+```bash
+# Check cluster status
+aws eks describe-cluster --name <cluster-name> --region us-west-2 --query 'cluster.status'
+
+# Check Kubernetes namespace
+kubectl get namespace fluss
+
+# Check Terraform state
+terraform state list
+```
+
+## Notes
+
+- **Warnings about deprecated `inline_policy`** - These are warnings from the EKS module, not errors. They can be ignored or will be fixed in future module versions.
+- **First-time deployment** - Expect Kubernetes connection errors during plan on first deployment
+- **Subsequent plans** - After cluster exists, plan should work without errors
+
+## Related Files
+
+- `s3_flink_checkpoints.tf` - Contains Kubernetes namespace and service account resources
+- `main.tf` - Contains Kubernetes provider configuration
+- `eks_cluster.tf` - Contains EKS cluster module
+
diff --git a/e2e-iot/high-infra/terraform/apply.sh b/e2e-iot/high-infra/terraform/apply.sh
new file mode 100755
index 0000000..167aaa3
--- /dev/null
+++ b/e2e-iot/high-infra/terraform/apply.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Wrapper script for terraform apply
+# Usage: ./apply.sh [terraform apply arguments]
+#
+# Note: ECR repositories are no longer managed by Terraform.
+# Create them manually via AWS CLI/Console and set repository URLs in terraform.tfvars
+
+set -euo pipefail  # Abort on command failure, on unset variables, and on failed pipeline stages
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+cd "$SCRIPT_DIR"  # Run Terraform against the configuration that sits next to this script
+
+echo "=== Running terraform apply ==="
+terraform apply "$@"  # All command-line arguments are forwarded unchanged
+
diff --git a/e2e-iot/high-infra/terraform/destroy.tfplan b/e2e-iot/high-infra/terraform/destroy.tfplan
new file mode 100644
index 0000000..29f610a
Binary files /dev/null and b/e2e-iot/high-infra/terraform/destroy.tfplan differ
diff --git a/e2e-iot/high-infra/terraform/ebs-csi.tf b/e2e-iot/high-infra/terraform/ebs-csi.tf
new file mode 100644
index 0000000..bfb09b2
--- /dev/null
+++ b/e2e-iot/high-infra/terraform/ebs-csi.tf
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# EBS CSI Driver for gp3 storage support
+# Note: The EBS CSI driver is installed in eks_cluster.tf by the standalone aws_eks_addon.ebs_csi_driver resource
+# (with its IAM role from module.ebs_csi_irsa), not through the EKS module's cluster_addons; this file only exposes outputs
+
+# Output: echoes the install_ebs_csi_driver input variable (the cluster itself is not queried)
+output "ebs_csi_driver_installed" {
+  description = "Whether EBS CSI driver addon is installed"
+  value       = var.install_ebs_csi_driver
+}
+
+output "ebs_csi_driver_role_arn" {
+  description = "IAM role ARN for EBS CSI driver"
+  value       = var.install_ebs_csi_driver ? module.ebs_csi_irsa[0].iam_role_arn : null  # null when the driver is disabled, because module.ebs_csi_irsa then has count 0
+}
+
diff --git a/e2e-iot/high-infra/terraform/ecr.tfplan b/e2e-iot/high-infra/terraform/ecr.tfplan
new file mode 100644
index 0000000..ab70bb5
Binary files /dev/null and b/e2e-iot/high-infra/terraform/ecr.tfplan differ
diff --git a/e2e-iot/high-infra/terraform/eks_cluster.tf b/e2e-iot/high-infra/terraform/eks_cluster.tf
new file mode 100644
index 0000000..c38476d
--- /dev/null
+++ b/e2e-iot/high-infra/terraform/eks_cluster.tf
@@ -0,0 +1,450 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# ================================================================================
+# EKS CLUSTER CREATION USING TERRAFORM AWS MODULES
+# ================================================================================
+# This configuration uses terraform-aws-modules/eks/aws and terraform-aws-modules/vpc/aws
+# to properly handle node joining and avoid manual aws-auth ConfigMap patching
+# ================================================================================
+
+# Look up the availability zones currently in the "available" state; module.vpc uses the first two names
+data "aws_availability_zones" "available" {
+  state = "available"
+}
+
+# VPC Module - Creates the VPC (10.0.0.0/16), two private and two public /20 subnets, NAT gateway and route tables
+module "vpc" {
+  source  = "terraform-aws-modules/vpc/aws"
+  version = "~> 5.0"
+
+  name = "${var.eks_cluster_name}-vpc"
+  cidr = "10.0.0.0/16"
+
+  # EKS requires subnets in at least 2 different AZs
+  # We use 2 AZs for subnets but configure node groups to use only one AZ for cost savings
+  azs             = slice(data.aws_availability_zones.available.names, 0, 2)  # Two AZs (required by EKS)
+  # Increased subnet size from /24 (251 IPs) to /20 (4091 IPs) to prevent IP exhaustion
+  # This provides ~16x more IP addresses per subnet for pods and nodes
+  # Subnets distributed across 2 AZs - EKS requirement
+  private_subnets = ["10.0.0.0/20", "10.0.16.0/20"]   # 4091 usable IPs each - AZ 1 and AZ 2
+  public_subnets  = ["10.0.32.0/20", "10.0.48.0/20"]  # 4091 usable IPs each - AZ 1 and AZ 2
+
+  enable_nat_gateway   = true  # Outbound internet access for the private subnets
+  single_nat_gateway   = true  # Use single NAT gateway to save costs (all nodes in one AZ)
+  enable_dns_hostnames = true
+  enable_dns_support   = true
+
+  public_subnet_tags = {
+    "kubernetes.io/role/elb"                    = "1"  # Subnet-discovery tag for internet-facing load balancers
+    "kubernetes.io/cluster/${var.eks_cluster_name}" = "shared"
+  }
+
+  private_subnet_tags = {
+    "kubernetes.io/role/internal-elb"              = "1"  # Subnet-discovery tag for internal load balancers
+    "kubernetes.io/cluster/${var.eks_cluster_name}" = "shared"
+  }
+
+  tags = {
+    Name        = "${var.eks_cluster_name}-vpc"
+    Project     = "fluss-deployment"
+    Environment = var.environment
+    ManagedBy   = "terraform"
+  }
+}
+
+# Local variables for node group configurations
+locals {
+  # Coordinator node group: fixed-size (min = max = desired) on-demand nodes, tainted with fluss-component=coordinator
+  coordinator_node_group = {
+    name           = "coordinator"
+    instance_types = [var.coordinator_instance_type]
+    capacity_type  = "ON_DEMAND"
+    min_size       = var.coordinator_instance_count
+    max_size       = var.coordinator_instance_count
+    desired_size   = var.coordinator_instance_count
+    disk_size      = 50  # NOTE(review): the v19 EKS module documents disk_size as effective only with use_custom_launch_template = false - confirm
+    disk_type      = "gp3"  # NOTE(review): not a documented input of the v19 eks-managed-node-group submodule; presumably ignored - confirm
+    # No AMI release version is pinned in this block, so the EKS module / service default applies
+    # (intended to keep the node AMI compatible with the cluster Kubernetes version)
+    subnet_ids     = [module.vpc.private_subnets[0]]  # Use only first AZ subnet
+
+    labels = {
+      "fluss-component" = "coordinator"
+      "node-type"       = "coordinator"
+      workload          = "fluss"
+      service           = "coordinator"
+    }
+
+    taints = [
+      {
+        key    = "fluss-component"
+        value  = "coordinator"
+        effect = "NO_SCHEDULE"  # Pods without a matching toleration are not scheduled onto these nodes
+      }
+    ]
+
+    tags = {
+      Name        = "${var.eks_cluster_name}-coordinator"
+      Component   = "coordinator"
+      Service     = "fluss"
+      Project     = "fluss-deployment"
+      Environment = var.environment
+    }
+
+    enable_monitoring = false
+    iam_role_additional_policies = {  # Extra managed policy attached to this node group's IAM role
+      AmazonEBSCSIDriverPolicy = "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy"
+    }
+  }
+
+  # Tablet server node group: fixed-size, untainted on-demand nodes whose local NVMe disk is prepared at boot
+  tablet_server_node_group = {
+    name           = "tablet-server"
+    instance_types = [var.tablet_server_instance_type]
+    capacity_type  = "ON_DEMAND"
+    min_size       = var.tablet_server_instance_count
+    max_size       = var.tablet_server_instance_count
+    desired_size   = var.tablet_server_instance_count
+    disk_size      = 100
+    disk_type      = "gp3"
+    # Let EKS module automatically select the latest compatible AMI release version
+    # This ensures compatibility with the cluster Kubernetes version
+    subnet_ids     = [module.vpc.private_subnets[0]]  # Use only first AZ subnet
+
+    labels = {
+      "fluss-component" = "tablet-server"
+      "node-type"       = "tablet-server"
+      "node.kubernetes.io/instance-type" = var.tablet_server_instance_type  # NOTE(review): this key is normally set by the kubelet and EKS may reject labels under kubernetes.io/ - verify
+      "storage-type"    = "nvme"
+      workload          = "fluss"
+      service           = "tablet-server"
+    }
+
+    taints = []
+
+    pre_bootstrap_user_data = <<-EOT
+      #!/bin/bash
+      # Format and mount NVMe drives for Fluss tablet servers
+      
+      # Wait (up to MAX_WAIT seconds) until lsblk lists at least one nvme* disk
+      # grep -c prints the count itself (0 on no match, with exit status 1), so only its status is neutralised with "|| true"
+      NVME_COUNT=0
+      MAX_WAIT=60
+      WAIT_COUNT=0
+      
+      while [ $WAIT_COUNT -lt $MAX_WAIT ]; do
+        NVME_COUNT=$(lsblk -d -n -o NAME | grep -c "^nvme" || true)
+        if [ "$NVME_COUNT" -gt 0 ]; then
+          break
+        fi
+        sleep 2
+        WAIT_COUNT=$((WAIT_COUNT + 2))
+      done
+      # NOTE(review): if the EBS root volume is also exposed as nvme*, this loop ends immediately - confirm on the target AMI
+      # Format and mount NVMe drives
+      # For i7i.8xlarge: typically has 2 NVMe drives (nvme1n1, nvme2n1)
+      # For i3en.6xlarge: typically has 2 NVMe drives
+      # For r6id.4xlarge: typically has 1 NVMe drive (nvme1n1)
+      
+      if [ -e /dev/nvme1n1 ]; then
+        echo "Setting up NVMe drive /dev/nvme1n1 for Fluss tablet server (all data)..."
+        mkfs.ext4 -F /dev/nvme1n1
+        mkdir -p /opt/alldata
+        mount /dev/nvme1n1 /opt/alldata
+        echo "/dev/nvme1n1 /opt/alldata ext4 defaults,noatime 0 2" >> /etc/fstab
+        chmod 755 /opt/alldata
+        # Set ownership (assuming Fluss runs as UID 1000, adjust if needed)
+        chown -R 1000:1000 /opt/alldata || true
+        echo "NVMe drive /dev/nvme1n1 mounted to /opt/alldata"
+      fi
+      
+      # Create Fluss data directories under /opt/alldata
+      if [ -d /opt/alldata ]; then
+        mkdir -p /opt/alldata/fluss/data
+        mkdir -p /opt/alldata/fluss/remote-data
+        mkdir -p /opt/alldata/fluss/logs
+        chown -R 1000:1000 /opt/alldata/fluss || true
+        echo "Created Fluss directories: data, remote-data, logs"
+      fi
+      
+      # If second NVMe drive exists, use it for additional storage or leave unused
+      if [ -e /dev/nvme2n1 ]; then
+        echo "Second NVMe drive /dev/nvme2n1 detected but not configured (using single drive at /opt/alldata)"
+      fi
+      
+      echo "NVMe setup completed for Fluss tablet servers"
+    EOT
+
+    tags = {
+      Name        = "${var.eks_cluster_name}-tablet-server"
+      Component   = "tablet-server"
+      Service     = "fluss"
+      Project     = "fluss-deployment"
+      Environment = var.environment
+    }
+
+    enable_monitoring = false
+    iam_role_additional_policies = {  # Extra managed policy attached to this node group's IAM role
+      AmazonEBSCSIDriverPolicy = "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy"
+    }
+  }
+
+  # Flink JobManager node group: exactly one on-demand node, tainted with flink-component=jobmanager
+  flink_jobmanager_node_group = {
+    name           = "flink-jobmanager"
+    instance_types = [var.flink_jobmanager_instance_type]
+    capacity_type  = "ON_DEMAND"
+    min_size       = 1
+    max_size       = 1
+    desired_size   = 1
+    disk_size      = 50  # NOTE(review): the v19 EKS module documents disk_size as effective only with use_custom_launch_template = false - confirm
+    disk_type      = "gp3"  # NOTE(review): not a documented input of the v19 eks-managed-node-group submodule; presumably ignored - confirm
+    # No AMI release version is pinned in this block, so the EKS module / service default applies
+    # (intended to keep the node AMI compatible with the cluster Kubernetes version)
+    subnet_ids     = [module.vpc.private_subnets[0]]  # Use only first AZ subnet
+
+    labels = {
+      "flink-component" = "jobmanager"
+      "node-type"       = "flink-jobmanager"
+      workload          = "flink"
+      service           = "flink-jobmanager"
+    }
+
+    taints = [
+      {
+        key    = "flink-component"
+        value  = "jobmanager"
+        effect = "NO_SCHEDULE"  # Pods without a matching toleration are not scheduled onto this node
+      }
+    ]
+
+    tags = {
+      Name        = "${var.eks_cluster_name}-flink-jobmanager"
+      Component   = "flink-jobmanager"
+      Service     = "flink"
+      Project     = "fluss-deployment"
+      Environment = var.environment
+    }
+
+    enable_monitoring = false
+    iam_role_additional_policies = {  # Extra managed policy attached to this node group's IAM role
+      AmazonEBSCSIDriverPolicy = "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy"
+    }
+  }
+
+  # Flink TaskManager node group: six on-demand nodes (size is hard-coded, not variable-driven), tainted with flink-component=taskmanager
+  flink_taskmanager_node_group = {
+    name           = "flink-taskmanager"
+    instance_types = [var.flink_taskmanager_instance_type]
+    capacity_type  = "ON_DEMAND"
+    min_size       = 6
+    max_size       = 6
+    desired_size   = 6
+    disk_size      = 100  # NOTE(review): the v19 EKS module documents disk_size as effective only with use_custom_launch_template = false - confirm
+    disk_type      = "gp3"  # NOTE(review): not a documented input of the v19 eks-managed-node-group submodule; presumably ignored - confirm
+    # No AMI release version is pinned in this block, so the EKS module / service default applies
+    # (intended to keep the node AMI compatible with the cluster Kubernetes version)
+    subnet_ids     = [module.vpc.private_subnets[0]]  # Use only first AZ subnet
+
+    labels = {
+      "flink-component" = "taskmanager"
+      "node-type"       = "flink-taskmanager"
+      workload          = "flink"
+      service           = "flink-taskmanager"
+    }
+
+    taints = [
+      {
+        key    = "flink-component"
+        value  = "taskmanager"
+        effect = "NO_SCHEDULE"  # Pods without a matching toleration are not scheduled onto these nodes
+      }
+    ]
+
+    tags = {
+      Name        = "${var.eks_cluster_name}-flink-taskmanager"
+      Component   = "flink-taskmanager"
+      Service     = "flink"
+      Project     = "fluss-deployment"
+      Environment = var.environment
+    }
+
+    enable_monitoring = false
+    iam_role_additional_policies = {  # Extra managed policy attached to this node group's IAM role
+      AmazonEBSCSIDriverPolicy = "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy"
+    }
+  }
+
+  # Producer node group: fixed-size (min = max = desired) on-demand nodes, tainted with producer-component=producer
+  producer_node_group = {
+    name           = "producer"
+    instance_types = [var.producer_instance_type]
+    capacity_type  = "ON_DEMAND"
+    min_size       = var.producer_instance_count
+    max_size       = var.producer_instance_count
+    desired_size   = var.producer_instance_count
+    disk_size      = 50  # NOTE(review): the v19 EKS module documents disk_size as effective only with use_custom_launch_template = false - confirm
+    disk_type      = "gp3"  # NOTE(review): not a documented input of the v19 eks-managed-node-group submodule; presumably ignored - confirm
+    # No AMI release version is pinned in this block, so the EKS module / service default applies
+    # (intended to keep the node AMI compatible with the cluster Kubernetes version)
+    subnet_ids     = [module.vpc.private_subnets[0]]  # Use only first AZ subnet
+
+    labels = {
+      "producer-component" = "producer"
+      "node-type"          = "producer"
+      workload             = "producer"
+      service              = "producer"
+    }
+
+    taints = [
+      {
+        key    = "producer-component"
+        value  = "producer"
+        effect = "NO_SCHEDULE"  # Pods without a matching toleration are not scheduled onto these nodes
+      }
+    ]
+
+    tags = {
+      Name        = "${var.eks_cluster_name}-producer"
+      Component   = "producer"
+      Service     = "producer"
+      Project     = "fluss-deployment"
+      Environment = var.environment
+    }
+
+    enable_monitoring = false
+    iam_role_additional_policies = {  # Extra managed policy attached to this node group's IAM role
+      AmazonEBSCSIDriverPolicy = "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy"
+    }
+  }
+}
+
+# EKS Module - Creates the cluster, its core addons and the five managed node groups defined in locals
+module "eks" {
+  source  = "terraform-aws-modules/eks/aws"
+  version = "~> 19.0"
+
+  cluster_name    = var.eks_cluster_name
+  cluster_version = var.kubernetes_version
+
+  vpc_id     = module.vpc.vpc_id
+  # EKS requires at least 2 subnets in different AZs
+  # Node groups are configured to use only the first AZ subnet for cost savings
+  subnet_ids = module.vpc.private_subnets
+
+  # Cluster endpoint access (NOTE(review): no public-access CIDR restriction is set in this block - confirm this is intended)
+  cluster_endpoint_public_access  = true
+  cluster_endpoint_private_access = true
+
+  # Enable IRSA (IAM Roles for Service Accounts) - Required for EBS CSI driver
+  enable_irsa = true
+
+  # Cluster addons - Core addons only (EBS CSI is installed by aws_eks_addon.ebs_csi_driver in this file)
+  cluster_addons = {
+    coredns = {
+      most_recent = true
+    }
+    kube-proxy = {
+      most_recent = true
+    }
+    vpc-cni = {
+      most_recent = true
+    }
+  }
+
+  # Enable cluster logging
+  cluster_enabled_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]
+
+  # EKS Managed Node Groups - one entry per local.*_node_group map defined in the locals block
+  eks_managed_node_groups = {
+    coordinator = local.coordinator_node_group
+    tablet_server = local.tablet_server_node_group
+    flink_jobmanager = local.flink_jobmanager_node_group
+    flink_taskmanager = local.flink_taskmanager_node_group
+    producer = local.producer_node_group
+  }
+
+  # aws-auth configmap - Managed by the module (presumably through the Kubernetes provider configured elsewhere - verify)
+  manage_aws_auth_configmap = true
+
+  tags = {
+    Name        = var.eks_cluster_name
+    Project     = "fluss-deployment"
+    Environment = var.environment
+    ManagedBy   = "terraform"
+  }
+}
+
+# EBS CSI Driver IRSA role: created only when var.install_ebs_csi_driver is true (count is 0 otherwise)
+# Created AFTER EKS module to get OIDC provider ARN
+module "ebs_csi_irsa" {
+  count = var.install_ebs_csi_driver ? 1 : 0
+
+  source  = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
+  version = "~> 5.0"
+
+  role_name = "${var.eks_cluster_name}-ebs-csi-driver"
+
+  attach_ebs_csi_policy = true
+
+  oidc_providers = {
+    main = {
+      provider_arn               = module.eks.oidc_provider_arn
+      namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"]  # "<namespace>:<service account>" allowed to assume the role
+    }
+  }
+
+  tags = {
+    Name        = "${var.eks_cluster_name}-ebs-csi-driver"
+    Project     = "fluss-deployment"
+    Environment = var.environment
+    ManagedBy   = "terraform"
+  }
+
+  depends_on = [module.eks]  # Must wait for OIDC provider
+}
+
+# Install EBS CSI driver addon separately (after IRSA role is created); skipped when var.install_ebs_csi_driver is false
+resource "aws_eks_addon" "ebs_csi_driver" {
+  count = var.install_ebs_csi_driver ? 1 : 0
+
+  cluster_name             = module.eks.cluster_name
+  addon_name               = "aws-ebs-csi-driver"
+  service_account_role_arn = module.ebs_csi_irsa[0].iam_role_arn  # IAM role from module.ebs_csi_irsa used by the addon's service account
+  resolve_conflicts_on_create = "OVERWRITE"
+  resolve_conflicts_on_update  = "OVERWRITE"
+
+  tags = {
+    Name        = "${var.eks_cluster_name}-ebs-csi-driver"
+    Project     = "fluss-deployment"
+    Environment = var.environment
+    ManagedBy   = "terraform"
+  }
+
+  depends_on = [  # Explicit ordering after the IRSA module and the whole EKS module (the role ARN reference alone already orders it after the role)
+    module.ebs_csi_irsa[0],
+    module.eks
+  ]
+
+  timeouts {
+    create = "10m"
+    update = "10m"
+    delete = "10m"
+  }
+}
+
diff --git a/e2e-iot/high-infra/terraform/import-ecr.sh b/e2e-iot/high-infra/terraform/import-ecr.sh
new file mode 100755
index 0000000..3e36d0a
--- /dev/null
+++ b/e2e-iot/high-infra/terraform/import-ecr.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# IMPORTANT: ECR repositories are NOT managed by Terraform to prevent accidental deletion
+# This script is disabled - ECR repositories should be created manually and NOT imported into Terraform state
+# If ECR repositories are in Terraform state, terraform destroy will delete them and all images!
+
+set -euo pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+cd "$SCRIPT_DIR"
+
+command -v terraform >/dev/null 2>&1 || { echo "ERROR: terraform is not installed or not in PATH" >&2; exit 1; }
+
+echo "=========================================="
+echo "ECR Import Script - DISABLED"
+echo "=========================================="
+echo ""
+echo "ECR repositories are NOT managed by Terraform to prevent"
+echo "accidental deletion of images during terraform destroy."
+echo ""
+echo "ECR repositories should be:"
+echo "  1. Created manually via AWS CLI/Console"
+echo "  2. Images pushed using push-images-to-ecr.sh"
+echo "  3. NEVER imported into Terraform state"
+echo ""
+echo "If ECR repositories exist in Terraform state, remove them with:"
+echo "  terraform state rm aws_ecr_repository.demo_app"
+echo "  terraform state rm aws_ecr_repository.fluss"
+echo "  terraform state rm aws_ecr_lifecycle_policy.demo_app"
+echo "  terraform state rm aws_ecr_lifecycle_policy.fluss"
+echo ""
+echo "Checking if ECR repositories are in Terraform state (they should NOT be)..."
+echo ""
+
+# Read-only check: `terraform state show` succeeds only when the address is tracked in state.
+if terraform state show aws_ecr_repository.demo_app >/dev/null 2>&1; then
+  echo "  ⚠ WARNING: aws_ecr_repository.demo_app is in Terraform state!"
+  echo "     Remove it with: terraform state rm aws_ecr_repository.demo_app"
+else
+  echo "  ✓ aws_ecr_repository.demo_app is NOT in state (correct)"
+fi
+
+if terraform state show aws_ecr_repository.fluss >/dev/null 2>&1; then
+  echo "  ⚠ WARNING: aws_ecr_repository.fluss is in Terraform state!"
+  echo "     Remove it with: terraform state rm aws_ecr_repository.fluss"
+else
+  echo "  ✓ aws_ecr_repository.fluss is NOT in state (correct)"
+fi
+
+echo ""
+echo "ECR import check complete - no imports performed (by design)."
+
diff --git a/e2e-iot/high-infra/terraform/kubeconfig b/e2e-iot/high-infra/terraform/kubeconfig
new file mode 100644
index 0000000..a1fd3a9
--- /dev/null
+++ b/e2e-iot/high-infra/terraform/kubeconfig
@@ -0,0 +1,29 @@
+apiVersion: v1
+clusters:
+- cluster:
+    certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURCVENDQWUyZ0F3SUJBZ0lJZXo2bktUMkYyam93RFFZSktvWklodmNOQVFFTEJRQXdGVEVUTUJFR0ExVUUKQXhNS2EzVmlaWEp1WlhSbGN6QWVGdzB5TlRFeE1qSXhOVFEyTlRKYUZ3MHpOVEV4TWpBeE5UVXhOVEphTUJVeApFekFSQmdOVkJBTVRDbXQxWW1WeWJtVjBaWE13Z2dFaU1BMEdDU3FHU0liM0RRRUJBUVVBQTRJQkR3QXdnZ0VLCkFvSUJBUURpMWRoRllMdXh0MVAwOWRWaHhHMEt1TXZvdFhnbUU0Q3g4Nlhrc0g3U0dLNHNlOFRDanJwc2lDekkKdEZoNDMxNlJpNDE4QngzOUhpT0d3K0JKMjBIaVdETlFSKzhMemszQkthWDNndzNRLzZVa3R0c0tOL1RPZm9WNwpwTUJoYUdmTjBSRDVYQnJQWmd4K2FyS0NpVUYxbW1UeXNTbUVYT1IrZlNWTThoZjFDbW1pYzFPQnVnM2hpdEpGClJlTG51Q3pxQ0xhdW9CZ0pLY3h3U3RXSi9rM3o0cGIxOVErUkFFalVra3lYU3N4Q3F1TGVGQTJLSVdKUFBITysKMyt1SEVXbjBTS3RGNDkxdlY0SW1rcEZrbGc5a2FkaUZWbEhlWldFRXEyOGRndzRCQ0k2SEdqVG1XN3FQa1R2Swp2TEpCUkptaGloS1RmT0lKQm9ndzA3SFdNbzZqQWdNQkFBR2pXVEJYTUE0R0ExVWREd0VCL3dRRUF3SUNwREFQCkJnTlZIUk1CQWY4RUJUQURBUUgvTUIwR0ExVWREZ1FXQkJRSTA1ZU9JRGlVQTkvZnJtYXZ3N0JzUWM5bEJEQVYKQmdOVkhSRUVEakFNZ2dwcmRXSmxjbTVsZEdWek1BMEdDU3FHU0liM0RRRUJDd1VBQTRJQkFRQyt5YUMxNjE3LwpwRFJDbFgvU3ZCYmZXQ1RQZW9tc1pkR05jMWhaOXg2Z05MK1dwekhoYVhyUXpDQitzWDdvMlh4NFE0OE53am1qClg1NXUzaWVwM3lBdjBiQWVQNTlROTcrMTVCaDVEMCtFc1ZTZk9xejYyVXcyNExRZ0NSaHh1ZDdjanEwR1pxcTMKaFFxRTg0YjJ6ckxHc1JNb3B6QzRLa0VVL0pXM0V1a2QwamMyTzR5NlJlZ1l3cHBGenErY0YzVG5SamlRWlhRdgpuUGxNQjMyblFyOHhDYTYrV0k0Tjh3cmxmNkEvWXAzVG0yaThOcnIreGtGYitOMytVVERlVnNyRDFxVlR3U09tCkQ1Q3FaRmpHSVVUN3loaSs1V0FlOWRPQktOWTVTOGhWS0F5QWN6ZXFNamN2ek5sRmxzTnRkam03WmFNTXRCRnEKNFlHRFBwV0FOVFgxCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K
+    server: https://7E7E71501A041C9BE6148523A19BF783.sk1.us-west-2.eks.amazonaws.com
+  name: arn:aws:eks:us-west-2:343218179954:cluster/fluss-eks-cluster
+contexts:
+- context:
+    cluster: arn:aws:eks:us-west-2:343218179954:cluster/fluss-eks-cluster
+    user: arn:aws:eks:us-west-2:343218179954:cluster/fluss-eks-cluster
+  name: arn:aws:eks:us-west-2:343218179954:cluster/fluss-eks-cluster
+current-context: arn:aws:eks:us-west-2:343218179954:cluster/fluss-eks-cluster
+kind: Config
+preferences: {}
+users:
+- name: arn:aws:eks:us-west-2:343218179954:cluster/fluss-eks-cluster
+  user:
+    exec:  # no static credential is stored; the command below is run to obtain one
+      apiVersion: client.authentication.k8s.io/v1beta1
+      args:
+      - --region
+      - us-west-2
+      - eks
+      - get-token
+      - --cluster-name
+      - fluss-eks-cluster
+      - --output
+      - json
+      command: aws  # the AWS CLI must be on PATH for this kubeconfig to work
diff --git a/e2e-iot/high-infra/terraform/main.tf b/e2e-iot/high-infra/terraform/main.tf
new file mode 100644
index 0000000..4509abe
--- /dev/null
+++ b/e2e-iot/high-infra/terraform/main.tf
@@ -0,0 +1,104 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+terraform {  # version constraints for Terraform itself and the providers used by this configuration
+  required_version = ">= 1.0"
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+    kubernetes = {
+      source  = "hashicorp/kubernetes"
+      version = "~> 2.23"
+    }
+    helm = {
+      source  = "hashicorp/helm"
+      version = "~> 2.11"
+    }
+    null = {
+      source  = "hashicorp/null"
+      version = "~> 3.2"
+    }
+  }
+
+  # Uncomment and configure backend for remote state
+  # backend "s3" {
+  #   bucket = "your-terraform-state-bucket"
+  #   key    = "aws-deploy-fluss/terraform.tfstate"
+  #   region = "us-west-2"
+  # }
+}
+
+# Configure AWS Provider
+provider "aws" {
+  region = var.aws_region
+
+  default_tags {  # same Project/Environment/ManagedBy keys that individual resources also set explicitly
+    tags = {
+      Project     = "fluss-deployment"
+      Environment = var.environment
+      ManagedBy   = "terraform"
+    }
+  }
+}
+
+# Get current AWS account and caller identity
+data "aws_caller_identity" "current" {}  # account_id is used to build the Flink state bucket name
+data "aws_region" "current" {}  # NOTE(review): no reference visible in this part of the configuration - confirm it is used
+
+# Configure Kubernetes Provider - Uses EKS module outputs
+provider "kubernetes" {  # NOTE(review): configured from module.eks outputs, so it is only usable once the cluster exists
+  host                   = module.eks.cluster_endpoint
+  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+
+  exec {  # token is obtained by running `aws eks get-token`; the AWS CLI must be on PATH
+    api_version = "client.authentication.k8s.io/v1beta1"
+    command     = "aws"
+    args = [
+      "eks",
+      "get-token",
+      "--cluster-name",
+      module.eks.cluster_name,
+      "--region",
+      var.aws_region
+    ]
+  }
+}
+
+# Configure Helm Provider - Uses EKS module outputs
+provider "helm" {
+  kubernetes {  # same endpoint, CA and exec-based authentication as the kubernetes provider in this file
+    host                   = module.eks.cluster_endpoint
+    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+
+    exec {
+      api_version = "client.authentication.k8s.io/v1beta1"
+      command     = "aws"
+      args = [
+        "eks",
+        "get-token",
+        "--cluster-name",
+        module.eks.cluster_name,
+        "--region",
+        var.aws_region
+      ]
+    }
+  }
+}
+
diff --git a/e2e-iot/high-infra/terraform/outputs.tf b/e2e-iot/high-infra/terraform/outputs.tf
new file mode 100644
index 0000000..54405b8
--- /dev/null
+++ b/e2e-iot/high-infra/terraform/outputs.tf
@@ -0,0 +1,98 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+output "eks_cluster_name" {
+  description = "Name of the EKS cluster"
+  value       = module.eks.cluster_name
+}
+
+output "eks_cluster_endpoint" {
+  description = "Endpoint for EKS control plane"
+  value       = module.eks.cluster_endpoint
+}
+
+output "coordinator_node_group_id" {
+  description = "Coordinator node group ID"
+  value       = module.eks.eks_managed_node_groups["coordinator"].node_group_id  # assumes module.eks defines a node group keyed "coordinator" - defined outside this file
+}
+
+output "tablet_server_node_group_id" {
+  description = "Tablet server node group ID"
+  value       = module.eks.eks_managed_node_groups["tablet_server"].node_group_id  # assumes module.eks defines a node group keyed "tablet_server" - defined outside this file
+}
+
+output "vpc_id" {
+  description = "VPC ID"
+  value       = module.vpc.vpc_id
+}
+
+output "private_subnet_ids" {
+  description = "Private subnet IDs"
+  value       = module.vpc.private_subnets
+}
+
+output "public_subnet_ids" {
+  description = "Public subnet IDs"
+  value       = module.vpc.public_subnets
+}
+
+output "namespace" {
+  description = "Kubernetes namespace"
+  value       = var.namespace  # echoes the input variable; not read back from the cluster
+}
+
+output "fluss_coordinator_service" {
+  description = "Fluss coordinator service name"
+  value       = "coordinator-server-hs.${var.namespace}.svc.cluster.local:9124"  # string built from var.namespace only; the service is not looked up
+}
+
+output "zookeeper_service" {
+  description = "ZooKeeper service name"
+  value       = "zk-svc.${var.namespace}.svc.cluster.local:2181"  # string built from var.namespace only; the service is not looked up
+}
+
+output "demo_image_repository" {
+  description = "ECR repository URL for demo image (must be set in terraform.tfvars)"
+  value       = var.demo_image_repository != "" ? var.demo_image_repository : "Not configured - set demo_image_repository in terraform.tfvars"  # empty variable yields a hint string instead of a URL
+}
+
+output "fluss_image_repository" {
+  description = "ECR repository URL for Fluss image (must be set in terraform.tfvars)"
+  value       = var.use_ecr_for_fluss ? (var.fluss_image_repository != "" ? var.fluss_image_repository : "Not configured - set fluss_image_repository in terraform.tfvars") : "Using Docker Hub: apache/fluss"  # three cases: ECR URL, ECR not configured, Docker Hub
+}
+
+output "grafana_url" {
+  description = "Grafana dashboard URL - Use: kubectl get svc -n monitoring prometheus-grafana"
+  value       = "Access Grafana via: kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80"  # an instruction string, not a URL
+}
+
+output "grafana_credentials" {
+  description = "Grafana login credentials"
+  value       = "Username: admin, Password: admin123"  # fixed demo credential; NOTE(review): move it to a variable or secret
+  sensitive   = true  # keeps the password out of plan/apply logs; read it with: terraform output grafana_credentials
+}
+
+output "prometheus_url" {
+  description = "Prometheus UI URL"
+  value       = "http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090"  # fixed in-cluster address; not reachable from outside the cluster as written
+}
+
+output "monitoring_namespace" {
+  description = "Kubernetes namespace for monitoring"
+  value       = "monitoring"  # literal; matches the namespace used in the Grafana and Prometheus outputs
+}
+
diff --git a/e2e-iot/high-infra/terraform/s3_flink_checkpoints.tf b/e2e-iot/high-infra/terraform/s3_flink_checkpoints.tf
new file mode 100644
index 0000000..ae1b895
--- /dev/null
+++ b/e2e-iot/high-infra/terraform/s3_flink_checkpoints.tf
@@ -0,0 +1,270 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# ================================================================================
+# S3 BUCKET FOR FLINK CHECKPOINTS AND SAVEPOINTS
+# ================================================================================
+# This configuration creates an S3 bucket for Flink state storage with proper
+# IAM permissions using IRSA (IAM Roles for Service Accounts)
+# ================================================================================
+
+# S3 Bucket for Flink Checkpoints and Savepoints
+resource "aws_s3_bucket" "flink_state" {
+  bucket = "${var.eks_cluster_name}-flink-state-${data.aws_caller_identity.current.account_id}"  # account ID suffix presumably avoids bucket-name collisions
+
+  force_destroy = true  # Allow terraform destroy to delete bucket even with objects/versions
+
+  tags = {
+    Name        = "${var.eks_cluster_name}-flink-state"
+    Project     = "fluss-deployment"
+    Environment = var.environment
+    ManagedBy   = "terraform"
+  }
+}
+
+resource "aws_s3_bucket_versioning" "flink_state" {
+  bucket = aws_s3_bucket.flink_state.id
+  versioning_configuration {
+    status = "Enabled"  # the lifecycle rules on this bucket expire noncurrent versions separately
+  }
+}
+
+resource "aws_s3_bucket_server_side_encryption_configuration" "flink_state" {
+  bucket = aws_s3_bucket.flink_state.id
+
+  rule {
+    apply_server_side_encryption_by_default {
+      sse_algorithm = "AES256"  # S3-managed keys; no KMS key is referenced
+    }
+  }
+}
+
+resource "aws_s3_bucket_public_access_block" "flink_state" {
+  bucket = aws_s3_bucket.flink_state.id
+
+  block_public_acls       = true  # all four public-access guards are switched on
+  block_public_policy     = true
+  ignore_public_acls      = true
+  restrict_public_buckets = true
+}
+
+# S3 Lifecycle policy for cost savings - delete old checkpoints
+resource "aws_s3_bucket_lifecycle_configuration" "flink_state" {
+  bucket = aws_s3_bucket.flink_state.id
+
+  rule {
+    id     = "delete-old-checkpoints"
+    status = "Enabled"
+
+    filter {
+      prefix = "flink-checkpoints/${var.eks_cluster_name}/"  # same prefix as the flink_s3_checkpoint_path output
+    }
+
+    expiration {
+      days = 7  # Delete checkpoints after 7 days
+    }
+
+    noncurrent_version_expiration {
+      noncurrent_days = 3  # versioning is enabled on this bucket, so superseded versions are removed by this setting
+    }
+
+    abort_incomplete_multipart_upload {
+      days_after_initiation = 1
+    }
+  }
+
+  rule {
+    id     = "delete-old-savepoints"
+    status = "Enabled"
+
+    filter {
+      prefix = "flink-savepoints/${var.eks_cluster_name}/"  # same prefix as the flink_s3_savepoint_path output
+    }
+
+    expiration {
+      days = 30  # Keep savepoints longer (30 days)
+    }
+
+    noncurrent_version_expiration {
+      noncurrent_days = 7
+    }
+
+    abort_incomplete_multipart_upload {
+      days_after_initiation = 1
+    }
+  }
+}
+
+# IAM Policy for Flink S3 Access
+resource "aws_iam_policy" "flink_s3_access" {
+  name        = "${var.eks_cluster_name}-flink-s3-policy"
+  description = "S3 permissions for Flink checkpoints and savepoints"
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Action = [
+          "s3:ListBucket",
+          "s3:GetBucketLocation"
+        ]
+        Resource = [
+          aws_s3_bucket.flink_state.arn  # bucket-level actions target the bucket ARN itself
+        ]
+      },
+      {
+        Effect = "Allow"
+        Action = [
+          "s3:GetObject",
+          "s3:PutObject",
+          "s3:DeleteObject"
+        ]
+        Resource = [
+          "${aws_s3_bucket.flink_state.arn}/*"  # object-level actions cover every key in the bucket, not only the Flink prefixes
+        ]
+      }
+    ]
+  })
+
+  tags = {
+    Name        = "${var.eks_cluster_name}-flink-s3-policy"
+    Project     = "fluss-deployment"
+    Environment = var.environment
+    ManagedBy   = "terraform"
+  }
+}
+
+# IAM Role for Flink Service Account (IRSA)
+resource "aws_iam_role" "flink_s3_access" {
+  name = "${var.eks_cluster_name}-flink-s3-access"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Principal = {
+          Federated = module.eks.oidc_provider_arn
+        }
+        Action = "sts:AssumeRoleWithWebIdentity"
+        Condition = {
+          StringEquals = {  # condition keys are the issuer URL without its "https://" prefix
+            "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:sub" = "system:serviceaccount:${var.namespace}:flink"  # only the "flink" service account in var.namespace
+            "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:aud" = "sts.amazonaws.com"
+          }
+        }
+      }
+    ]
+  })
+
+  tags = {
+    Name        = "${var.eks_cluster_name}-flink-s3-access"
+    Project     = "fluss-deployment"
+    Environment = var.environment
+    ManagedBy   = "terraform"
+  }
+}
+
+# Attach S3 policy to IAM role
+resource "aws_iam_role_policy_attachment" "flink_s3_access" {
+  role       = aws_iam_role.flink_s3_access.name
+  policy_arn = aws_iam_policy.flink_s3_access.arn
+}
+
+# Block until the EKS control plane reports ACTIVE so the Kubernetes resources
+# that depend on this resource do not race the cluster API. Best effort: a failed wait only warns.
+resource "null_resource" "wait_for_cluster" {
+  # NOTE(review): cluster_id is only populated for Outposts clusters in terraform-aws-eks v19; cluster_name is what the rest of this configuration uses.
+  triggers = {
+    cluster_name = module.eks.cluster_name
+  }
+
+  provisioner "local-exec" {
+    command = <<-EOT
+      echo "Waiting for EKS cluster to be ready..."
+      if aws eks wait cluster-active --name ${module.eks.cluster_name} --region ${var.aws_region}; then
+        echo "Cluster is ready"
+      else
+        echo "WARNING: cluster-active wait failed; continuing without confirmation" >&2
+      fi
+    EOT
+  }
+}
+
+# Create namespace for Flink service account
+# Depends on EKS cluster being created and ready
+resource "kubernetes_namespace" "fluss" {
+  depends_on = [
+    module.eks.cluster_id,
+    module.eks.cluster_endpoint,
+    null_resource.wait_for_cluster  # Wait for cluster to be fully ready
+  ]
+
+  metadata {
+    name = var.namespace  # the Flink IRSA trust policy in this file is scoped to this namespace
+    labels = {
+      Project     = "fluss-deployment"
+      Environment = var.environment
+      ManagedBy   = "terraform"
+    }
+  }
+}
+
+# Kubernetes Service Account for Flink with IRSA annotation
+resource "kubernetes_service_account" "flink" {
+  depends_on = [kubernetes_namespace.fluss]
+  
+  metadata {
+    name      = "flink"  # must stay "flink": the IAM role trust policy matches system:serviceaccount:<namespace>:flink
+    namespace = var.namespace
+    annotations = {
+      "eks.amazonaws.com/role-arn" = aws_iam_role.flink_s3_access.arn  # links this account to the Flink S3 role
+    }
+    labels = {
+      app       = "flink"
+      component = "flink"
+    }
+  }
+}
+
+# Outputs
+output "flink_s3_bucket_name" {
+  description = "S3 bucket name for Flink checkpoints"
+  value       = aws_s3_bucket.flink_state.id
+}
+
+output "flink_s3_bucket_arn" {
+  description = "S3 bucket ARN for Flink checkpoints"
+  value       = aws_s3_bucket.flink_state.arn
+}
+
+output "flink_s3_checkpoint_path" {
+  description = "S3 path for Flink checkpoints"
+  value       = "s3://${aws_s3_bucket.flink_state.id}/flink-checkpoints/${var.eks_cluster_name}/"  # prefix covered by the delete-old-checkpoints lifecycle rule
+}
+
+output "flink_s3_savepoint_path" {
+  description = "S3 path for Flink savepoints"
+  value       = "s3://${aws_s3_bucket.flink_state.id}/flink-savepoints/${var.eks_cluster_name}/"  # prefix covered by the delete-old-savepoints lifecycle rule
+}
+
+output "flink_iam_role_arn" {
+  description = "IAM role ARN for Flink S3 access"
+  value       = aws_iam_role.flink_s3_access.arn
+}
+
diff --git a/e2e-iot/high-infra/terraform/terraform.tfvars b/e2e-iot/high-infra/terraform/terraform.tfvars
new file mode 100644
index 0000000..cfc3b7e
--- /dev/null
+++ b/e2e-iot/high-infra/terraform/terraform.tfvars
@@ -0,0 +1,57 @@
+# Terraform variables for Fluss deployment
+# This file contains the actual ECR repository URLs after pushing images
+
+aws_region = "us-west-2"
+environment = "dev"
+eks_cluster_name = "fluss-eks-cluster"
+namespace = "fluss"
+
+# Fluss configuration
+fluss_version = "0.8.0-incubating"
+# ECR repository URL for Fluss image (updated after push-images-to-ecr.sh)
+fluss_image_repository = "343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss"  # account- and region-specific; change for any other AWS account
+use_ecr_for_fluss = true
+
+# Demo application image (ECR repository URL)
+# Updated after push-images-to-ecr.sh
+demo_image_repository = "343218179954.dkr.ecr.us-west-2.amazonaws.com/fluss-demo"  # account- and region-specific; change for any other AWS account
+demo_image_tag = "latest"
+
+# ZooKeeper configuration
+zookeeper_replicas = 1
+
+# Fluss configuration
+coordinator_replicas = 1
+tablet_server_replicas = 3
+
+# Storage configuration
+# Set to false to use root volume (gp3) of EC2 instances instead of separate EBS volumes
+enable_persistence = false  # Tablet servers will write to /tmp/fluss/data on root volume
+storage_class = "gp3"  # Only used if enable_persistence = true
+storage_size = "20Gi"  # Only used if enable_persistence = true
+
+# EBS CSI Driver (for gp3 PersistentVolumes)
+# Install EBS CSI driver addon - required if enable_persistence = true
+# Even if persistence is disabled, installing it allows future flexibility
+install_ebs_csi_driver = true
+
+# EC2 Instance Configuration for Fluss Nodes
+# These instances will be added to EKS cluster with specific labels
+coordinator_instance_type = "c5.2xlarge"
+tablet_server_instance_type = "i7i.8xlarge"
+coordinator_instance_count = 1
+tablet_server_instance_count = 3
+
+# Flink Instance Configuration
+flink_jobmanager_instance_type = "c5.4xlarge"
+flink_taskmanager_instance_type = "c5.4xlarge"
+
+# Producer Instance Configuration
+producer_instance_type = "c5.2xlarge"
+producer_instance_count = 4  # overrides the variables.tf default of 1
+
+# EC2 Instance Configuration
+# key_name = "your-key-pair-name"  # Optional: SSH key for EC2 instances
+# subnet_ids = ["subnet-xxx", "subnet-yyy"]  # Optional (defaults to []): Specific subnets where instances will be launched
+# security_group_ids = ["sg-xxx"]  # Optional: Additional security groups
+
diff --git a/e2e-iot/high-infra/terraform/terraform.tfvars.example b/e2e-iot/high-infra/terraform/terraform.tfvars.example
new file mode 100644
index 0000000..89220ca
--- /dev/null
+++ b/e2e-iot/high-infra/terraform/terraform.tfvars.example
@@ -0,0 +1,50 @@
+# Copy this file to terraform.tfvars and update with your values
+
+aws_region = "us-west-2"  # keep in sync with the region embedded in the ECR repository URLs in this file
+environment = "dev"
+eks_cluster_name = "your-eks-cluster-name"  # placeholder - replace before use
+namespace = "fluss"
+
+# Fluss configuration
+fluss_version = "0.8.0-incubating"
+# ECR repository URL for Fluss image
+# Replace <account-id> with your AWS account ID: <account-id>.dkr.ecr.<region>.amazonaws.com/fluss
+fluss_image_repository = "<account-id>.dkr.ecr.us-west-2.amazonaws.com/fluss"
+use_ecr_for_fluss = true  # Set to false to use Docker Hub (apache/fluss)
+
+# Demo application image (ECR repository URL)
+# Replace <account-id> with your AWS account ID: <account-id>.dkr.ecr.<region>.amazonaws.com/fluss-demo
+demo_image_repository = "<account-id>.dkr.ecr.us-west-2.amazonaws.com/fluss-demo"
+demo_image_tag = "latest"
+
+# ZooKeeper configuration
+zookeeper_replicas = 1
+
+# Fluss configuration
+coordinator_replicas = 1
+tablet_server_replicas = 3
+
+# Storage configuration
+# Set to false to use root volume (gp3) of EC2 instances instead of separate EBS volumes
+# When false, tablet servers write to /tmp/fluss/data on the root volume
+enable_persistence = false
+storage_class = "gp3"  # Only used if enable_persistence = true
+storage_size = "20Gi"  # Only used if enable_persistence = true
+
+# EBS CSI Driver (for gp3 PersistentVolumes)
+# Install EBS CSI driver addon - required if enable_persistence = true
+# Even if persistence is disabled, installing it allows future flexibility
+install_ebs_csi_driver = true
+
+# EC2 Instance Configuration for Fluss Nodes
+# These instances will be added to EKS cluster with specific labels
+coordinator_instance_type = "t3.medium"  # smaller than the variables.tf default (c5.2xlarge)
+tablet_server_instance_type = "t3.medium"  # smaller than the variables.tf default (i7i.8xlarge)
+coordinator_instance_count = 1
+tablet_server_instance_count = 3
+
+# EC2 Instance Configuration
+# key_name = "your-key-pair-name"  # Optional: SSH key for EC2 instances
+# subnet_ids = ["subnet-xxx", "subnet-yyy"]  # Optional: Specific subnets for instances
+# security_group_ids = ["sg-xxx"]  # Optional: Additional security groups
+
diff --git a/e2e-iot/high-infra/terraform/tfplan b/e2e-iot/high-infra/terraform/tfplan
new file mode 100644
index 0000000..e0ec7db
Binary files /dev/null and b/e2e-iot/high-infra/terraform/tfplan differ
diff --git a/e2e-iot/high-infra/terraform/tfplan-phase1 b/e2e-iot/high-infra/terraform/tfplan-phase1
new file mode 100644
index 0000000..88cddbe
Binary files /dev/null and b/e2e-iot/high-infra/terraform/tfplan-phase1 differ
diff --git a/e2e-iot/high-infra/terraform/tfplan-single-az b/e2e-iot/high-infra/terraform/tfplan-single-az
new file mode 100644
index 0000000..403371e
Binary files /dev/null and b/e2e-iot/high-infra/terraform/tfplan-single-az differ
diff --git a/e2e-iot/high-infra/terraform/tfplan-single-az-fixed b/e2e-iot/high-infra/terraform/tfplan-single-az-fixed
new file mode 100644
index 0000000..b6fc2b8
Binary files /dev/null and b/e2e-iot/high-infra/terraform/tfplan-single-az-fixed differ
diff --git a/e2e-iot/high-infra/terraform/tfplan-subnet b/e2e-iot/high-infra/terraform/tfplan-subnet
new file mode 100644
index 0000000..a0e1d7d
Binary files /dev/null and b/e2e-iot/high-infra/terraform/tfplan-subnet differ
diff --git a/e2e-iot/high-infra/terraform/tfplan-subnet-upgrade b/e2e-iot/high-infra/terraform/tfplan-subnet-upgrade
new file mode 100644
index 0000000..47ec8b2
Binary files /dev/null and b/e2e-iot/high-infra/terraform/tfplan-subnet-upgrade differ
diff --git a/e2e-iot/high-infra/terraform/tfplan-taskmanager b/e2e-iot/high-infra/terraform/tfplan-taskmanager
new file mode 100644
index 0000000..71038a8
Binary files /dev/null and b/e2e-iot/high-infra/terraform/tfplan-taskmanager differ
diff --git a/e2e-iot/high-infra/terraform/variables.tf b/e2e-iot/high-infra/terraform/variables.tf
new file mode 100644
index 0000000..221968c
--- /dev/null
+++ b/e2e-iot/high-infra/terraform/variables.tf
@@ -0,0 +1,185 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+variable "aws_region" {
+  description = "AWS region for resources"
+  type        = string
+  default     = "us-west-2"  # also passed to `aws eks get-token` by the kubernetes and helm providers
+}
+
+variable "environment" {
+  description = "Environment name (dev, staging, prod)"
+  type        = string
+  default     = "dev"  # no validation block: values outside the documented list are accepted
+}
+
+variable "eks_cluster_name" {
+  description = "Name of the EKS cluster to create"
+  type        = string
+  default     = "fluss-eks-cluster"  # also used as a prefix for IAM role/policy names and the Flink state bucket
+}
+
+variable "kubernetes_version" {
+  description = "Kubernetes version for EKS cluster"
+  type        = string
+  default     = "1.29"
+}
+
+variable "namespace" {
+  description = "Kubernetes namespace for Fluss deployment"
+  type        = string
+  default     = "fluss"  # also embedded in the Flink IRSA trust policy and the service DNS outputs
+}
+
+variable "fluss_version" {
+  description = "Fluss version to deploy"
+  type        = string
+  default     = "0.8.0-incubating"
+}
+
+variable "fluss_image_repository" {
+  description = "Fluss Docker image repository (ECR URL or Docker Hub)"
+  type        = string
+  default     = "" # Will be set to ECR URL if use_ecr_for_fluss is true
+}
+
+variable "use_ecr_for_fluss" {
+  description = "Use ECR repository for Fluss image instead of Docker Hub"
+  type        = bool
+  default     = true  # when false, the fluss_image_repository output reports Docker Hub (apache/fluss)
+}
+
+variable "demo_image_repository" {
+  description = "ECR repository for demo application image (fluss-demo)"
+  type        = string
+  default     = ""  # empty makes the demo_image_repository output report "Not configured"
+}
+
+variable "demo_image_tag" {
+  description = "Tag for demo application image"
+  type        = string
+  default     = "latest"
+}
+
+variable "zookeeper_replicas" {
+  description = "Number of ZooKeeper replicas"
+  type        = number
+  default     = 1
+}
+
+variable "coordinator_replicas" {
+  description = "Number of Fluss coordinator replicas"
+  type        = number
+  default     = 1
+}
+
+variable "tablet_server_replicas" {
+  description = "Number of Fluss tablet server replicas"
+  type        = number
+  default     = 3  # same value as the tablet_server_instance_count default
+}
+
+variable "enable_persistence" {
+  description = "Enable persistent volumes for Fluss. If false, uses root volume (emptyDir). If true, requires EBS CSI driver."
+  type        = bool
+  default     = false
+}
+
+variable "install_ebs_csi_driver" {
+  description = "Install EBS CSI driver addon (required for gp3 PersistentVolumes if enable_persistence = true)"
+  type        = bool
+  default     = true  # gates both module.ebs_csi_irsa and aws_eks_addon.ebs_csi_driver
+}
+
+variable "storage_class" {
+  description = "Storage class for persistent volumes"
+  type        = string
+  default     = "gp3"
+}
+
+variable "storage_size" {
+  description = "Storage size for persistent volumes (e.g., 20Gi)"
+  type        = string
+  default     = "20Gi"
+}
+
+variable "coordinator_instance_type" {
+  description = "EC2 instance type for Fluss coordinator"
+  type        = string
+  default     = "c5.2xlarge"
+}
+
+variable "tablet_server_instance_type" {
+  description = "EC2 instance type for Fluss tablet servers (should have NVMe local storage like i7i.8xlarge, i3en.6xlarge, or r6id.4xlarge)"
+  type        = string
+  default     = "i7i.8xlarge"
+}
+
+variable "coordinator_instance_count" {
+  description = "Number of coordinator instances"
+  type        = number
+  default     = 1
+}
+
+variable "tablet_server_instance_count" {
+  description = "Number of tablet server instances"
+  type        = number
+  default     = 3
+}
+
+variable "producer_instance_type" {
+  description = "EC2 instance type for producer nodes"
+  type        = string
+  default     = "c5.2xlarge"
+}
+
+variable "producer_instance_count" {
+  description = "Number of producer instances"
+  type        = number
+  default     = 1  # terraform.tfvars in this directory overrides this to 4
+}
+
+variable "flink_jobmanager_instance_type" {
+  description = "EC2 instance type for Flink JobManager nodes"
+  type        = string
+  default     = "c5.4xlarge"
+}
+
+variable "flink_taskmanager_instance_type" {
+  description = "EC2 instance type for Flink TaskManager nodes"
+  type        = string
+  default     = "c5.4xlarge"
+}
+
+variable "key_name" {
+  description = "AWS Key Pair name for EC2 instances"
+  type        = string
+  default     = ""  # left commented out in both tfvars files, so the default applies unless set
+}
+
+variable "subnet_ids" {
+  description = "List of subnet IDs for EC2 instances (should be private subnets)"
+  type        = list(string)
+  default     = []  # left commented out in both tfvars files, so the default applies unless set
+}
+
+variable "security_group_ids" {
+  description = "List of security group IDs for EC2 instances"
+  type        = list(string)
+  default     = []  # left commented out in both tfvars files, so the default applies unless set
+}
+
diff --git a/e2e-iot/port-forward-flink.sh b/e2e-iot/port-forward-flink.sh
new file mode 100755
index 0000000..5f9f8e8
--- /dev/null
+++ b/e2e-iot/port-forward-flink.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail
+
+# Forward the in-cluster Flink JobManager service to a local port (runs until Ctrl+C).
+NAMESPACE="fluss"
+LOCAL_PORT="${LOCAL_PORT:-8081}" # override via the environment when 8081 is busy locally
+REMOTE_PORT="8081"
+SERVICE_NAME="flink-jobmanager"
+
+echo "=== Port Forwarding Flink JobManager ==="
+echo "Namespace: ${NAMESPACE}"
+echo "Service: ${SERVICE_NAME}"
+echo ""
+
+# Check kubectl is available (diagnostics go to stderr so stdout stays clean)
+if ! command -v kubectl &> /dev/null; then
+    echo "ERROR: kubectl is not installed or not in PATH" >&2
+    exit 1
+fi
+
+# Check if service exists; on failure list what is there so a wrong context is easy to spot
+if ! kubectl get svc -n "${NAMESPACE}" "${SERVICE_NAME}" &> /dev/null; then
+    echo "ERROR: Service ${SERVICE_NAME} not found in namespace ${NAMESPACE}" >&2
+    echo "" >&2
+    echo "Available services in ${NAMESPACE} namespace:" >&2
+    kubectl get svc -n "${NAMESPACE}" >&2 || echo "  No services found" >&2
+    exit 1
+fi
+
+# Advisory check of the JobManager pod: problems only warn, the port-forward is still attempted
+JOBMANAGER_POD=$(kubectl get pods -n "${NAMESPACE}" -l app=flink,component=jobmanager -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+if [ -z "${JOBMANAGER_POD}" ]; then
+    echo "WARNING: Flink JobManager pod not found. Port-forward may fail." >&2
+    echo "" >&2
+    echo "Available pods in ${NAMESPACE} namespace:" >&2
+    kubectl get pods -n "${NAMESPACE}" | grep -E "NAME|flink" >&2 || kubectl get pods -n "${NAMESPACE}" >&2
+else
+    JOBMANAGER_STATUS=$(kubectl get pod -n "${NAMESPACE}" "${JOBMANAGER_POD}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "")
+    if [ "${JOBMANAGER_STATUS}" != "Running" ]; then
+        echo "WARNING: Flink JobManager pod is not Running (status: ${JOBMANAGER_STATUS})" >&2
+    else
+        echo "✓ Flink JobManager pod is running: ${JOBMANAGER_POD}"
+    fi
+fi
+
+echo ""
+echo "Starting port-forward..."
+echo "  Local port: ${LOCAL_PORT}"
+echo "  Remote port: ${REMOTE_PORT}"
+echo ""
+echo "Access Flink Web UI at: http://localhost:${LOCAL_PORT}"
+echo ""
+echo "Press Ctrl+C to stop port-forwarding"
+echo ""
+
+# Start port-forward (blocks in the foreground until interrupted)
+kubectl port-forward -n "${NAMESPACE}" "svc/${SERVICE_NAME}" "${LOCAL_PORT}:${REMOTE_PORT}"
+
diff --git a/e2e-iot/port-forward-grafana.sh b/e2e-iot/port-forward-grafana.sh
new file mode 100755
index 0000000..4167732
--- /dev/null
+++ b/e2e-iot/port-forward-grafana.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail
+
+# Forward the in-cluster Grafana service to a local port (runs until Ctrl+C).
+NAMESPACE="monitoring"
+LOCAL_PORT="${LOCAL_PORT:-3000}" # override via the environment when 3000 is busy locally
+REMOTE_PORT="80"
+
+echo "=== Port Forwarding Grafana ==="
+echo "Namespace: ${NAMESPACE}"
+echo ""
+
+# Check kubectl is available (diagnostics go to stderr so stdout stays clean)
+if ! command -v kubectl &> /dev/null; then
+    echo "ERROR: kubectl is not installed or not in PATH" >&2
+    exit 1
+fi
+
+# Find Grafana service: first by its standard label, then by name
+GRAFANA_SVC=$(kubectl get svc -n "${NAMESPACE}" -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+if [ -z "${GRAFANA_SVC}" ]; then
+    # Fallback: first service whose NAME column contains "grafana" (other columns are ignored)
+    GRAFANA_SVC=$(kubectl get svc -n "${NAMESPACE}" --no-headers 2>/dev/null | awk 'tolower($1) ~ /grafana/ { print $1; exit }' || echo "")
+fi
+
+if [ -z "${GRAFANA_SVC}" ]; then
+    echo "ERROR: Grafana service not found in namespace ${NAMESPACE}" >&2
+    echo "" >&2
+    echo "Available services in ${NAMESPACE} namespace:" >&2
+    kubectl get svc -n "${NAMESPACE}" >&2 || echo "  No services found" >&2
+    exit 1
+fi
+
+echo "Found Grafana service: ${GRAFANA_SVC}"
+echo ""
+echo "Starting port-forward..."
+echo "  Local port: ${LOCAL_PORT}"
+echo "  Remote port: ${REMOTE_PORT}"
+echo ""
+echo "Access Grafana at: http://localhost:${LOCAL_PORT}"
+echo "  Username: admin"
+echo "  Password: admin123"
+echo ""
+echo "Press Ctrl+C to stop port-forwarding"
+echo ""
+
+# Start port-forward (blocks in the foreground until interrupted)
+kubectl port-forward -n "${NAMESPACE}" "svc/${GRAFANA_SVC}" "${LOCAL_PORT}:${REMOTE_PORT}"
+
diff --git a/e2e-iot/push-images-to-ecr.sh b/e2e-iot/push-images-to-ecr.sh
new file mode 100755
index 0000000..5ae49c9
--- /dev/null
+++ b/e2e-iot/push-images-to-ecr.sh
@@ -0,0 +1,289 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euo pipefail
+
+# Script to build and push images to ECR:
+# 1. fluss-demo (for producer and flink aggregator)
+# 2. fluss (Apache Fluss image)
+#
+# Usage:
+#   ./push-images-to-ecr.sh --all              # Push both images
+#   ./push-images-to-ecr.sh --producer-only    # Push only producer image
+#   ./push-images-to-ecr.sh --fluss-only       # Push only Fluss image
+# Environment overrides: AWS_REGION (default us-west-2), FLUSS_VERSION (default 0.8.0-incubating).
+# Paths are resolved from this script's own directory, so it can be invoked from any working directory.
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+BASE_DIR="${SCRIPT_DIR}"
+DEMO_DIR="${BASE_DIR}/fluss_flink_realtime"
+AWS_REGION=${AWS_REGION:-us-west-2}
+FLUSS_VERSION=${FLUSS_VERSION:-0.8.0-incubating}
+ECR_INFO_FILE="${BASE_DIR}/ecr-repositories.txt"
+
+# Colors for output
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+# Validate the expected layout: the demo sources must sit next to this script
+if [ ! -d "${DEMO_DIR}" ]; then
+    echo -e "${RED}Error: Cannot find fluss_flink_realtime directory${NC}" >&2
+    echo -e "${RED}Expected: ${DEMO_DIR}${NC}" >&2
+    echo -e "${RED}Please keep this script next to the fluss_flink_realtime directory${NC}" >&2
+    exit 1
+fi
+
+# The script's own directory name is deliberately not checked: every path above is
+# derived from SCRIPT_DIR, so the checkout may live under any directory name.
+# Fail fast if the CLIs used by every mode are missing.
+for REQUIRED_TOOL in aws docker; do
+    if ! command -v "${REQUIRED_TOOL}" &> /dev/null; then
+        echo -e "${RED}Error: ${REQUIRED_TOOL} is not installed or not in PATH${NC}" >&2
+        exit 1
+    fi
+done
+
+# Parse command line arguments: exactly one mode flag is accepted.
+# Each arm records which images to handle and the banner text for that mode;
+# a missing or unknown flag prints the usage text and exits with status 1.
+# PUSH_DEMO / PUSH_FLUSS drive the later build, pull and summary steps.
+PUSH_DEMO=false
+PUSH_FLUSS=false
+MODE_LABEL=""
+
+case "${1:-}" in
+    --all)
+        PUSH_DEMO=true
+        PUSH_FLUSS=true
+        MODE_LABEL="Push both producer and Fluss images"
+        ;;
+    --producer-only)
+        PUSH_DEMO=true
+        MODE_LABEL="Push only producer image"
+        ;;
+    --fluss-only)
+        PUSH_FLUSS=true
+        MODE_LABEL="Push only Fluss image"
+        ;;
+    *)
+        echo -e "${RED}Error: Missing or invalid argument${NC}"
+        echo -e "Usage:"
+        echo -e "  $0 --all            # Push both images"
+        echo -e "  $0 --producer-only  # Push only producer image"
+        echo -e "  $0 --fluss-only     # Push only Fluss image"
+        exit 1
+        ;;
+esac
+
+# Banner, then the mode chosen above
+echo -e "${GREEN}=== Building and Pushing Images to ECR ===${NC}\n"
+echo -e "${YELLOW}Mode: ${MODE_LABEL}${NC}\n"
+
+# Get AWS account ID (the "||" fallback stops set -e from exiting before the error message below)
+AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) || AWS_ACCOUNT_ID=""
+if [ -z "$AWS_ACCOUNT_ID" ]; then
+    echo -e "${RED}Error: Unable to get AWS account ID. Is AWS CLI configured?${NC}" >&2
+    exit 1
+fi
+
+ECR_BASE="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com"
+DEMO_REPO="${ECR_BASE}/fluss-demo"
+FLUSS_REPO="${ECR_BASE}/fluss"
+
+echo -e "${YELLOW}AWS Account ID: ${AWS_ACCOUNT_ID}${NC}"
+echo -e "${YELLOW}AWS Region: ${AWS_REGION}${NC}"
+echo -e "${YELLOW}Demo Repository: ${DEMO_REPO}${NC}"
+echo -e "${YELLOW}Fluss Repository: ${FLUSS_REPO}${NC}\n"
+
+# [1/6] Select the buildx builder: reuse or create "fluss-multiplatform", else fall back to "default"
+echo -e "${YELLOW}[1/6] Setting up Docker buildx for cross-platform builds...${NC}"
+BUILDER_NAME="fluss-multiplatform"
+if docker buildx inspect "${BUILDER_NAME}" &>/dev/null; then
+    docker buildx use "${BUILDER_NAME}" 2>/dev/null || BUILDER_NAME="default"
+    docker buildx inspect --bootstrap "${BUILDER_NAME}" &>/dev/null || true
+else
+    echo -e "${YELLOW}Creating buildx builder for linux/amd64 platform...${NC}"
+    if ! docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use --bootstrap 2>/dev/null; then
+        echo -e "${YELLOW}Builder creation failed, using default...${NC}"
+        BUILDER_NAME="default"
+    fi
+fi
+echo -e "${GREEN}✓ Buildx builder ready${NC}\n"
+
+# [2/6] Authenticate the local Docker client against the account's ECR registry
+echo -e "${YELLOW}[2/6] Logging in to ECR...${NC}"
+aws ecr get-login-password --region "${AWS_REGION}" \
+    | docker login --username AWS --password-stdin "${ECR_BASE}"
+echo -e "${GREEN}✓ Logged in to ECR${NC}\n"
+
+# [3/6] Both repositories should already exist (they should be created by Terraform);
+#       any that is missing is created here, in the order fluss-demo, fluss.
+echo -e "${YELLOW}[3/6] Checking ECR repositories...${NC}"
+for ECR_REPO_NAME in fluss-demo fluss; do
+    if aws ecr describe-repositories --repository-names "${ECR_REPO_NAME}" --region "${AWS_REGION}" >/dev/null 2>&1; then
+        continue
+    fi
+    echo -e "${YELLOW}Creating ${ECR_REPO_NAME} repository...${NC}"
+    aws ecr create-repository --repository-name "${ECR_REPO_NAME}" --region "${AWS_REGION}" >/dev/null
+done
+echo -e "${GREEN}✓ ECR repositories ready${NC}\n"
+
+# [4/6] Producer image: Maven package -> buildx image for linux/amd64 -> tag -> push to ECR
+if [ "$PUSH_DEMO" != true ]; then
+    echo -e "${YELLOW}[4/6] Skipping producer image (not requested)${NC}\n"
+else
+    echo -e "${YELLOW}[4/6] Building producer application image...${NC}"
+    echo -e "${YELLOW}Step 1: Building JAR from source (clean build)...${NC}"
+    cd "${DEMO_DIR}"
+    mvn clean package
+    # The first matching artifact under target/ is taken as proof that the build produced a JAR
+    JAR_FILE=$(find "${DEMO_DIR}/target" -name "fluss-flink-realtime-demo*.jar" -type f 2>/dev/null | head -1)
+    if [[ -z "${JAR_FILE}" || ! -f "${JAR_FILE}" ]]; then
+        echo -e "${RED}Error: JAR file not found after build${NC}"
+        exit 1
+    fi
+    echo -e "${GREEN}✓ JAR built successfully: ${JAR_FILE}${NC}"
+    echo ""
+
+    echo -e "${YELLOW}Step 2: Building Docker image for linux/amd64 (AWS compatible)...${NC}"
+    # buildx targets linux/amd64 (required for AWS EC2); the working directory is still DEMO_DIR.
+    # Up to MAX_RETRIES attempts, 10s apart, in case of transient network/mirror issues.
+    MAX_RETRIES=3
+    for (( BUILD_ATTEMPT = 1; BUILD_ATTEMPT <= MAX_RETRIES; BUILD_ATTEMPT++ )); do
+        if (( BUILD_ATTEMPT > 1 )); then
+            echo -e "${YELLOW}Retry attempt $((BUILD_ATTEMPT - 1))/${MAX_RETRIES}...${NC}"
+            sleep 10
+        fi
+
+        if docker buildx build --builder "${BUILDER_NAME}" --platform linux/amd64 --load -t fluss-demo:latest .; then
+            echo -e "${GREEN}✓ Docker image built successfully${NC}"
+            break
+        fi
+
+        if (( BUILD_ATTEMPT == MAX_RETRIES )); then
+            echo -e "${RED}✗ Docker buildx build failed after ${MAX_RETRIES} attempts${NC}"
+            echo -e "${RED}This may be due to network/mirror issues. Please check your connection and try again.${NC}"
+            exit 1
+        fi
+    done
+
+    # Publish under the moving "latest" tag and a timestamp tag
+    TIMESTAMP=$(date +%Y%m%d-%H%M%S)
+    for DEMO_TAG in latest "${TIMESTAMP}"; do
+        docker tag fluss-demo:latest "${DEMO_REPO}:${DEMO_TAG}"
+    done
+
+    echo -e "${YELLOW}Step 3: Pushing producer image to ECR...${NC}"
+    for DEMO_TAG in latest "${TIMESTAMP}"; do
+        docker push "${DEMO_REPO}:${DEMO_TAG}"
+    done
+    echo -e "${GREEN}✓ Producer image pushed to ${DEMO_REPO}${NC}\n"
+fi
+
+# [5/6] Mirror the upstream Apache Fluss image (linux/amd64) into ECR as :<version> and :latest
+if [ "$PUSH_FLUSS" != true ]; then
+    echo -e "${YELLOW}[5/6] Skipping Fluss image (not requested)${NC}\n"
+else
+    FLUSS_IMAGE="apache/fluss:${FLUSS_VERSION}"
+    echo -e "${YELLOW}[5/6] Pulling Apache Fluss image from Docker Hub (linux/amd64)...${NC}"
+    docker pull --platform linux/amd64 "${FLUSS_IMAGE}"
+    echo -e "${GREEN}✓ Fluss image pulled${NC}"
+    echo -e "${YELLOW}Tagging Fluss image for ECR...${NC}"
+    for FLUSS_TAG in "${FLUSS_VERSION}" latest; do
+        docker tag "${FLUSS_IMAGE}" "${FLUSS_REPO}:${FLUSS_TAG}"
+    done
+    echo -e "${YELLOW}Pushing Fluss image to ECR...${NC}"
+    for FLUSS_TAG in "${FLUSS_VERSION}" latest; do
+        docker push "${FLUSS_REPO}:${FLUSS_TAG}"
+    done
+    echo -e "${GREEN}✓ Fluss image pushed to ${FLUSS_REPO}${NC}\n"
+fi
+
+# Summary: list every tag pushed in this run (the producer image gets :latest and a timestamp tag)
+echo -e "${GREEN}=== Image Push Complete ===${NC}\n\nImages pushed:"
+if [ "$PUSH_DEMO" = true ]; then
+    echo -e "  ${DEMO_REPO}:latest"
+    echo -e "  ${DEMO_REPO}:${TIMESTAMP}"
+fi
+if [ "$PUSH_FLUSS" = true ]; then
+    echo -e "  ${FLUSS_REPO}:${FLUSS_VERSION}"
+    echo -e "  ${FLUSS_REPO}:latest"
+fi
+echo -e ""
+
+# Save ECR repository details to a sourceable file (tfvars lines use no spaces around '=' so bash accepts them too)
+echo -e "${YELLOW}[6/6] Saving ECR repository details to ${ECR_INFO_FILE}...${NC}"
+cat > "${ECR_INFO_FILE}" << EOF
+# ECR Repository Details
+# Generated on: $(date)
+# AWS Account ID: ${AWS_ACCOUNT_ID}
+# AWS Region: ${AWS_REGION}
+
+EOF
+
+if [ "$PUSH_DEMO" = true ]; then
+    cat >> "${ECR_INFO_FILE}" << EOF
+# Demo/Producer Image Repository
+DEMO_IMAGE_REPOSITORY="${DEMO_REPO}"
+DEMO_IMAGE_TAG="latest"
+
+# For terraform.tfvars (also a valid shell assignment, so this file stays sourceable):
+demo_image_repository="${DEMO_REPO}"
+
+EOF
+fi
+
+if [ "$PUSH_FLUSS" = true ]; then
+    cat >> "${ECR_INFO_FILE}" << EOF
+# Fluss Image Repository
+FLUSS_IMAGE_REPOSITORY="${FLUSS_REPO}"
+FLUSS_IMAGE_VERSION="${FLUSS_VERSION}"
+
+# For terraform.tfvars (also valid shell assignments, so this file stays sourceable):
+fluss_image_repository="${FLUSS_REPO}"
+use_ecr_for_fluss=true
+
+EOF
+fi
+
+cat >> "${ECR_INFO_FILE}" << EOF
+# Full ECR Base URL
+ECR_BASE="${ECR_BASE}"
+
+# To use these values in shell scripts:
+# source ${ECR_INFO_FILE}
+# echo \${DEMO_IMAGE_REPOSITORY}
+EOF
+
+echo -e "${GREEN}✓ ECR repository details saved to ${ECR_INFO_FILE}${NC}"
+echo -e ""
+echo -e "To use these values:"
+echo -e "  source ${ECR_INFO_FILE}"
+echo -e ""
+echo -e "Or update terraform.tfvars with:"
+if [ "$PUSH_DEMO" = true ]; then
+    echo -e "  demo_image_repository = \"${DEMO_REPO}\""
+fi
+if [ "$PUSH_FLUSS" = true ]; then
+    echo -e "  fluss_image_repository = \"${FLUSS_REPO}\""
+    echo -e "  use_ecr_for_fluss = true"
+fi
+echo -e ""
+