diff --git a/android/src/main/kotlin/ch/waio/pro_video_editor/ProVideoEditorPlugin.kt b/android/src/main/kotlin/ch/waio/pro_video_editor/ProVideoEditorPlugin.kt
index d41af78..e6741e9 100644
--- a/android/src/main/kotlin/ch/waio/pro_video_editor/ProVideoEditorPlugin.kt
+++ b/android/src/main/kotlin/ch/waio/pro_video_editor/ProVideoEditorPlugin.kt
@@ -303,7 +303,7 @@ class ProVideoEditorPlugin : FlutterPlugin, MethodCallHandler {
                         }
                     },
                     onError = { error ->
-                        Log.e("RenderVideo", "Error rendering video: ${error.message}")
+                        Log.e("RenderVideo", "Error rendering video: ${error.message}", error)
                         mainHandler.post {
                             val removedTask = activeRenderTasks.remove(id)
                             val code = if (removedTask?.canceled?.get() == true) {
@@ -321,9 +321,11 @@ class ProVideoEditorPlugin : FlutterPlugin, MethodCallHandler {
                     jobHandle.cancel()
                 }
             } catch (e: IllegalArgumentException) {
+                Log.e("RenderVideo", "Error rendering video: ${e.message}", e)
                 activeRenderTasks.remove(id)
                 result.error("INVALID_ARGUMENTS", e.message, null)
             } catch (e: Exception) {
+                Log.e("RenderVideo", "Error rendering video: ${e.message}", e)
                 activeRenderTasks.remove(id)
                 result.error("RENDER_ERROR", "Failed to start render: ${e.message}", null)
             }
diff --git a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/RenderVideo.kt b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/RenderVideo.kt
index 4051de8..1abbd5f 100644
--- a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/RenderVideo.kt
+++ b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/RenderVideo.kt
@@ -18,8 +18,10 @@ import java.util.concurrent.atomic.AtomicReference
 import applyBitrate
 import mapFormatToMimeType
 import ch.waio.pro_video_editor.src.features.render.helpers.applyComposition
+import ch.waio.pro_video_editor.src.features.render.helpers.CompositionBuilder
 import ch.waio.pro_video_editor.src.features.render.helpers.VolumeControlAudioMixerFactory
 import ch.waio.pro_video_editor.src.features.render.helpers.ConfigurableInAppMp4Muxer
+import ch.waio.pro_video_editor.src.features.render.helpers.MediaInfoExtractor
 import ch.waio.pro_video_editor.src.features.render.helpers.VideoTranscoder
 import ch.waio.pro_video_editor.src.features.render.models.RenderConfig
 import ch.waio.pro_video_editor.src.features.render.models.RenderJobHandle
@@ -57,27 +59,59 @@ class RenderVideo(private val context: Context) {
      * Checks if transcoding is needed for video compatibility.
      *
      * Transcoding is needed when:
-     * 1. GPU effects are used with HEVC 10-bit HDR videos
+     * 1. GPU effects are used with HEVC 10-bit HDR videos.
      * 2. Multiple videos are being merged and at least one is HEVC 10-bit
-     *    (mixing different codecs in a composition can cause frame processing errors)
+     *    (mixing different codecs in a composition can cause frame processing errors).
+     * 3. Multi-channel audio (5.1/7.1) is detected AND more than one unmuted audio source
+     *    is present (mixing multi-channel with stereo or silence causes mixer reconfiguration errors).
      */
     private fun needsPreTranscoding(config: RenderConfig): Boolean {
-        // Check for GPU effects
+        // 1. Check for HEVC 10-bit / HDR compatibility with GPU effects
         if (hasGpuEffects(config)) {
-            return true
+            val hasAnyHevc10bit = config.videoClips.any { clip ->
+                val info = MediaInfoExtractor.getVideoFormatInfo(clip.inputPath)
+                info.isHevc && info.bitDepth == 10
+            }
+            if (hasAnyHevc10bit) {
+                Log.d(RENDER_TAG, "HEVC 10-bit detected with GPU effects, pre-transcoding needed")
+                return true
+            }
+        }
+
+        // 2. Check for multi-channel audio mixing complexity
+        val unmutedVideoClips = config.videoClips.filter { (it.volume ?: 1.0f) > 0.0f }
+        val audibleCustomTracks = config.audioTracks.filter { it.volume > 0.0f }
+        val totalAudibleSources = unmutedVideoClips.size + audibleCustomTracks.size
+
+        val hasMultiChannel = config.videoClips.any { clip ->
+            (MediaInfoExtractor.getAudioChannelCount(clip.inputPath) ?: 2) > 2
+        }
+
+        if (hasMultiChannel) {
+            // We only MUST transcode multi-channel audio if it needs to be mixed with something else.
+            // If it's the only source, Media3 can handle downmixing via AudioProcessors during render.
+            if (totalAudibleSources > 1) {
+                Log.d(RENDER_TAG, "Multi-channel audio detected with multiple sources, pre-transcoding for mixing safety")
+                return true
+            }
+
+            // TODO: with multiple video clips (even if muted), mixing/overlapping transitions
+            // can trigger audio reconfiguration errors if formats don't match perfectly.
+            if (config.videoClips.size > 1) {
+                Log.d(RENDER_TAG, "Multi-channel audio detected in multi-clip merge, pre-transcoding for transition stability")
+                return true
+            }
         }
 
-        // When multiple clips are being merged, check if any need transcoding
-        // Mixing different codecs (HEVC + H.264) can cause frame processing errors
+        // 3. Codec mixing check (H.264 + HEVC)
+        // Mixing different codecs (HEVC + H.264) in a single sequence can cause frame processing errors
         if (config.videoClips.size > 1) {
             val hasAnyHevc10bit = config.videoClips.any { clip ->
-                VideoTranscoder.needsTranscoding(clip.inputPath)
+                val info = MediaInfoExtractor.getVideoFormatInfo(clip.inputPath)
+                info.isHevc && info.bitDepth == 10
             }
             if (hasAnyHevc10bit) {
-                Log.d(
-                    RENDER_TAG, "Multiple video clips with HEVC 10-bit detected, " +
-                        "pre-transcoding to ensure codec compatibility"
-                )
+                Log.d(RENDER_TAG, "HEVC 10-bit detected in merge, pre-transcoding to ensure consistency")
                 return true
             }
         }
@@ -138,8 +172,8 @@ class RenderVideo(private val context: Context) {
         val updatedClips = config.videoClips.map { clip ->
             val newPath = transcodeMap[clip.inputPath] ?: clip.inputPath
             if (newPath != clip.inputPath) {
-                // If transcoded, use the new path but keep trim times and volume
-                VideoClip(newPath, clip.startUs, clip.endUs, clip.volume)
+                // If transcoded, use the new path but preserve all other parameters
+                clip.copy(inputPath = newPath)
             } else {
                 clip
             }
@@ -241,10 +275,39 @@ class RenderVideo(private val context: Context) {
         // Check if we need custom audio mixing with volume control
        val hasCustomAudio = config.audioTracks.isNotEmpty()
 
-        // Determine if video audio will be present in the mix
-        // Video audio is removed when audio is disabled or all clips have volume 0
-        val videoAudioPresent = config.enableAudio &&
-            config.videoClips.any { (it.volume ?: 1.0f) > 0.0f }
+        // Determine how many video sequences will contribute to the audio mix
+        // A video sequence has audio if audio is enabled AND at least one clip has volume > 0
+        val videoAudioSourceCount: Int
+        val videoSequenceVolumes: List<Float>
+
+        if (config.enableAudio) {
+            val needsMultipleSequences = config.videoClips.any {
+                it.x != null || it.y != null || it.width != null || it.height != null ||
+                it.segmentTimeUs != null || it.opacity != null || (it.zIndex ?: 0) != 0
+            }
+            if (needsMultipleSequences) {
+                // In PiP mode, each clip is a separate sequence.
+                // IMPORTANT: Match the exact layering logic from CompositionBuilder.
+                // 1. Higher zIndex on top.
+                // 2. Default zIndex is 0.
+                // 3. If zIndex is same, latter segment in input list is on top.
+                //
+                // In Media3, the first sequence in the list is the bottom-most layer.
+                // By using a stable ascending sort, we satisfy the rules.
+                val sortedClips = config.videoClips.sortedBy { it.zIndex ?: 0 }
+                val activeClips = sortedClips.filter { (it.volume ?: 1.0f) > 0.0f }
+                videoAudioSourceCount = activeClips.size
+                videoSequenceVolumes = activeClips.map { it.volume ?: 1.0f }
+            } else {
+                // Single sequence. Has audio if any clip is unmuted.
+                val isUnmuted = config.videoClips.any { (it.volume ?: 1.0f) > 0.0f }
+                videoAudioSourceCount = if (isUnmuted) 1 else 0
+                videoSequenceVolumes = if (isUnmuted) listOf(1.0f) else emptyList()
+            }
+        } else {
+            videoAudioSourceCount = 0
+            videoSequenceVolumes = emptyList()
+        }
 
         // Build transformer with callbacks
         val transformerBuilder = Transformer.Builder(context)
@@ -266,7 +329,8 @@ class RenderVideo(private val context: Context) {
             transformerBuilder.setAudioMixerFactory(
                 VolumeControlAudioMixerFactory(
                     trackVolumes = trackVolumes,
-                    videoAudioPresent = videoAudioPresent
+                    videoAudioSourceCount = videoAudioSourceCount,
+                    videoSequenceVolumes = videoSequenceVolumes
                 )
             )
         }
@@ -310,16 +374,25 @@ class RenderVideo(private val context: Context) {
         // Create composition (now fast - no manual audio mixing needed, Media3 handles it natively)
         Thread {
             try {
-                val composition = applyComposition(
-                    context = context,
-                    config = config,
-                    videoEffects = videoEffects,
-                    audioEffects = audioEffects
-                )
+                val compositionBuilder = CompositionBuilder(config, context)
+                val composition = compositionBuilder
+                    .setVideoEffects(videoEffects)
+                    .setAudioEffects(audioEffects)
+                    .build()
 
                 mainHandler.post {
                     if (composition != null) {
                         transformer.start(composition, outputFile.absolutePath)
+
+                        // Register release on transformer end
+                        transformer.addListener(object : Transformer.Listener {
+                            override fun onCompleted(c: Composition, r: ExportResult) {
+                                compositionBuilder.release()
+                            }
+
+                            override fun onError(c: Composition, r: ExportResult, e: ExportException) {
+                                compositionBuilder.release()
+                            }
+                        })
 
                         // Start progress tracking loop
                         val progressHolder = ProgressHolder()
@@ -339,6 +412,7 @@ class RenderVideo(private val context: Context) {
                             }
                         })
                     } else {
+                        compositionBuilder.release()
                        onError(IllegalStateException("Failed to create composition"))
                     }
                 }
diff --git a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/ApplyComposition.kt b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/ApplyComposition.kt
index 3582caf..0205b90 100644
--- a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/ApplyComposition.kt
+++ b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/ApplyComposition.kt
@@ -27,7 +27,7 @@ fun applyComposition(
     videoEffects: List<Effect>,
     audioEffects: List<AudioProcessor>
 ): Composition? {
-    return CompositionBuilder(context, config)
+    return CompositionBuilder(config, context)
         .setVideoEffects(videoEffects)
         .setAudioEffects(audioEffects)
         .build()
diff --git a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/ApplyImageLayer.kt b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/ApplyImageLayer.kt
index 0e08a58..c935f7a 100644
--- a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/ApplyImageLayer.kt
+++ b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/ApplyImageLayer.kt
@@ -67,7 +67,7 @@ fun applyImageLayer(
 @UnstableApi
 fun applyTimedImageLayers(
     videoEffects: MutableList<Effect>,
-    imageLayers: List<VideoSequenceBuilder.ImageLayerConfig>,
+    imageLayers: List<ImageLayer>,
     videoWidth: Int,
     videoHeight: Int
 ) {
@@ -79,7 +79,7 @@ fun applyTimedImageLayers(
     )
     for (layer in imageLayers) {
         try {
-            val imageBytes = layer.imageBytes ?: continue
+            val imageBytes = layer.imageData
             val options = BitmapFactory.Options().apply {
                 inPreferredConfig = Bitmap.Config.ARGB_8888
             }
diff --git a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/ApplyOpacity.kt b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/ApplyOpacity.kt
new file mode 100644
index 0000000..d79ab0d
--- /dev/null
+++ b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/ApplyOpacity.kt
@@ -0,0 +1,24 @@
+package ch.waio.pro_video_editor.src.features.render.helpers
+
+import RENDER_TAG
+import androidx.media3.common.Effect
+import androidx.media3.common.util.UnstableApi
+import androidx.media3.effect.AlphaScale
+import ch.waio.pro_video_editor.src.shared.logging.PluginLog as Log
+
+/**
+ * Applies opacity to a video segment.
+ *
+ * @param videoEffects List to add opacity effect to
+ * @param opacity Transparency factor (0.0 to 1.0)
+ */
+@UnstableApi
+fun applyOpacity(
+    videoEffects: MutableList<Effect>,
+    opacity: Float?
+) {
+    if (opacity == null || opacity >= 1.0f) return
+
+    Log.d(RENDER_TAG, "Applying opacity: $opacity")
+    videoEffects += AlphaScale(opacity)
+}
diff --git a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/AudioMixingUtils.kt b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/AudioMixingUtils.kt
new file mode 100644
index 0000000..96548e8
--- /dev/null
+++ b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/AudioMixingUtils.kt
@@ -0,0 +1,75 @@
+package ch.waio.pro_video_editor.src.features.render.helpers
+
+import RENDER_TAG
+import androidx.media3.common.audio.AudioProcessor
+import androidx.media3.common.audio.ChannelMixingAudioProcessor
+import androidx.media3.common.audio.ChannelMixingMatrix
+import androidx.media3.common.util.UnstableApi
+import ch.waio.pro_video_editor.src.shared.logging.PluginLog as Log
+
+/**
+ * Utility functions for audio channel mixing and normalization.
+ */
+@UnstableApi
+object AudioMixingUtils {
+
+    /**
+     * Creates a ChannelMixingAudioProcessor configured with standard mixing matrices
+     * to downmix common multi-channel formats to Stereo (2 channels).
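+     *
+     * Each matrix below is laid out row-major per input channel: every input
+     * channel contributes a (left, right) coefficient pair, so the 5.1 matrix
+     * has 6 rows of 2 values and, for example, its front-center row
+     * (0.707, 0.707) spreads the center channel equally into both outputs.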
+     *
+     * Supports:
+     * - 1 channel (Mono) -> Stereo
+     * - 2 channels (Stereo) -> Stereo (Identity)
+     * - 4 channels (Quad) -> Stereo
+     * - 6 channels (5.1 Surround) -> Stereo (ITU-R BS.775)
+     * - 8 channels (7.1 Surround) -> Stereo
+     */
+    fun createStandardStereoMixer(): ChannelMixingAudioProcessor {
+        val channelMixer = ChannelMixingAudioProcessor()
+        val boost = 1.4f // Slight boost to compensate for downmixing volume loss
+
+        // 8 channels (7.1) -> 2 channels (Stereo)
+        // FL, FR, FC, LFE, BL, BR, SL, SR
+        val eightToTwo = floatArrayOf(
+            1.0f * boost, 0.0f,             // FL -> L, R
+            0.0f, 1.0f * boost,             // FR -> L, R
+            0.707f * boost, 0.707f * boost, // FC -> L, R
+            0.0f, 0.0f,                     // LFE
+            0.707f * boost, 0.0f,           // BL -> L
+            0.0f, 0.707f * boost,           // BR -> R
+            0.707f * boost, 0.0f,           // SL -> L
+            0.0f, 0.707f * boost            // SR -> R
+        )
+        channelMixer.putChannelMixingMatrix(ChannelMixingMatrix(8, 2, eightToTwo))
+
+        // 6 channels (5.1) -> 2 channels (Stereo)
+        // FL, FR, FC, LFE, BL, BR
+        val sixToTwo = floatArrayOf(
+            1.0f * boost, 0.0f,             // FL -> L, R
+            0.0f, 1.0f * boost,             // FR -> L, R
+            0.707f * boost, 0.707f * boost, // FC -> L, R
+            0.0f, 0.0f,                     // LFE
+            0.707f * boost, 0.0f,           // BL -> L
+            0.0f, 0.707f * boost            // BR -> R
+        )
+        channelMixer.putChannelMixingMatrix(ChannelMixingMatrix(6, 2, sixToTwo))
+
+        // 4 channels (Quad) -> 2 channels (Stereo)
+        // FL, FR, BL, BR
+        val fourToTwo = floatArrayOf(
+            1.0f, 0.0f,   // FL -> L
+            0.0f, 1.0f,   // FR -> R
+            0.707f, 0.0f, // BL -> L
+            0.0f, 0.707f  // BR -> R
+        )
+        channelMixer.putChannelMixingMatrix(ChannelMixingMatrix(4, 2, fourToTwo))
+
+        // 2 channels -> 2 channels (Stereo passthrough)
+        channelMixer.putChannelMixingMatrix(ChannelMixingMatrix.createForConstantGain(2, 2))
+
+        // 1 channel (Mono) -> 2 channels (Stereo)
+        channelMixer.putChannelMixingMatrix(ChannelMixingMatrix.createForConstantGain(1, 2))
+
+        return channelMixer
+    }
+}
diff --git a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/AudioSequenceBuilder.kt b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/AudioSequenceBuilder.kt
index 8a72386..a1ec32c 100644
--- a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/AudioSequenceBuilder.kt
+++ b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/AudioSequenceBuilder.kt
@@ -2,6 +2,7 @@ package ch.waio.pro_video_editor.src.features.render.helpers
 
 import RENDER_TAG
 import android.net.Uri
+import androidx.media3.common.C
 import androidx.media3.common.MediaItem
 import androidx.media3.common.audio.AudioProcessor
 import androidx.media3.common.audio.ChannelMixingAudioProcessor
@@ -143,51 +144,45 @@ class AudioSequenceBuilder(
         val compEnd = compositionEndTimeUs ?: videoDurationUs
         val targetDurationUs = (compEnd - compStart).coerceAtLeast(0L)
 
-        // Build audio effects
-        val audioProcessors = buildAudioProcessors()
-        val audioEffects = Effects(audioProcessors, emptyList())
-
-        // Create audio content items with looping or single play
+        // Create audio content items with looping or single play.
+        // NOTE: AudioProcessor instances cannot be shared across multiple EditedMediaItems.
+        // We create fresh effects for each item inside the creation methods.
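+        // (A configured AudioProcessor carries per-stream state, so reusing one
+        // instance across items could leak state from one item into the next.)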
         val audioContentItems = if (loopAudio) {
             createLoopedAudioItems(
                 audioFile,
                 sourceEndUs,
                 effectiveAudioDurationUs,
-                targetDurationUs,
-                audioEffects
+                targetDurationUs
             )
         } else {
-            createSingleAudioItem(audioFile, sourceEndUs, effectiveAudioDurationUs, targetDurationUs, audioEffects)
+            createSingleAudioItem(audioFile, sourceEndUs, effectiveAudioDurationUs, targetDurationUs)
         }
 
-        val allItems = mutableListOf<EditedMediaItem>()
+        // Build audio sequence using addGap() for leading and trailing silence.
+        // This is more efficient than generating temporary silent WAV files
+        // and avoids potential NPEs with empty MediaItems.
+        val trackTypes = setOf<@C.TrackType Int>(C.TRACK_TYPE_AUDIO)
+        val sequenceBuilder = EditedMediaItemSequence.Builder(trackTypes)
 
         // Add leading silence if audio starts after composition time 0.
-        // Media3 parallel sequences always start at time 0, so we need
-        // silence padding to offset the audio to the correct position.
         if (compStart > 0) {
-            val silentItem = createSilentAudioItem(compStart, audioEffects)
-            if (silentItem != null) {
-                allItems.add(silentItem)
-                Log.d(RENDER_TAG, "Added ${compStart / 1000}ms leading silence for composition offset")
-            }
+            sequenceBuilder.addGap(compStart)
+            Log.d(RENDER_TAG, "Added ${compStart / 1000}ms leading gap for composition offset")
         }
 
-        allItems.addAll(audioContentItems)
+        for (item in audioContentItems) {
+            sequenceBuilder.addItem(item)
+        }
 
         // Add trailing silence so the sequence spans the full video duration.
-        // This ensures all parallel sequences have matching lengths.
         val totalContentDurationUs = compStart + targetDurationUs
         if (totalContentDurationUs < videoDurationUs) {
             val trailingDurationUs = videoDurationUs - totalContentDurationUs
-            val silentItem = createSilentAudioItem(trailingDurationUs, audioEffects)
-            if (silentItem != null) {
-                allItems.add(silentItem)
-                Log.d(RENDER_TAG, "Added ${trailingDurationUs / 1000}ms trailing silence")
-            }
+            sequenceBuilder.addGap(trailingDurationUs)
+            Log.d(RENDER_TAG, "Added ${trailingDurationUs / 1000}ms trailing gap")
         }
 
-        return EditedMediaItemSequence.Builder(allItems).build()
+        return sequenceBuilder.build()
     }
 
     /**
@@ -200,48 +195,7 @@ class AudioSequenceBuilder(
 
         // Add channel mixing if needed
         if (needsNormalization) {
-            val channelMixer = ChannelMixingAudioProcessor()
-
-            // 7.1 Surround (8 channels) to Stereo (2 channels)
-            // Channel order: FL, FR, FC, LFE, BL, BR, SL, SR
-            val eightToTwo = floatArrayOf(
-                1.0f, 0.0f, 0.707f, 0.0f, 0.707f, 0.0f, 0.707f, 0.0f, // Left output
-                0.0f, 1.0f, 0.707f, 0.0f, 0.0f, 0.707f, 0.0f, 0.707f  // Right output
-            )
-            channelMixer.putChannelMixingMatrix(
-                ChannelMixingMatrix(8, 2, eightToTwo)
-            )
-
-            // 5.1 Surround (6 channels) to Stereo (2 channels)
-            // ITU-R BS.775 standard
-            val sixToTwo = floatArrayOf(
-                1.0f, 0.0f, 0.707f, 0.0f, 0.707f, 0.0f, // Left output
-                0.0f, 1.0f, 0.707f, 0.0f, 0.0f, 0.707f  // Right output
-            )
-            channelMixer.putChannelMixingMatrix(
-                ChannelMixingMatrix(6, 2, sixToTwo)
-            )
-
-            // Quad (4 channels) to Stereo (2 channels)
-            val fourToTwo = floatArrayOf(
-                1.0f, 0.0f, 0.707f, 0.0f, // Left output
-                0.0f, 1.0f, 0.0f, 0.707f  // Right output
-            )
-            channelMixer.putChannelMixingMatrix(
-                ChannelMixingMatrix(4, 2, fourToTwo)
-            )
-
-            // Stereo (2 channels) to Stereo (2 channels) - passthrough
-            channelMixer.putChannelMixingMatrix(
-                ChannelMixingMatrix.createForConstantGain(2, 2)
-            )
-
-            // Mono (1 channel) to Stereo (2 channels)
-            channelMixer.putChannelMixingMatrix(
-                ChannelMixingMatrix.createForConstantGain(1, 2)
-            )
-
-            processors.add(channelMixer)
+            processors.add(AudioMixingUtils.createStandardStereoMixer())
             Log.d(RENDER_TAG, "Added channel normalization for custom audio")
         }
 
@@ -268,14 +222,13 @@ class AudioSequenceBuilder(
         audioFile: File,
         sourceEndUs: Long,
         effectiveAudioDurationUs: Long,
-        targetDurationUs: Long,
-        effects: Effects
+        targetDurationUs: Long
     ): List<EditedMediaItem> {
         val audioItems = mutableListOf<EditedMediaItem>()
 
         if (effectiveAudioDurationUs <= 0 || targetDurationUs <= 0) {
             // Fallback: add audio once without duration constraints
-            val audioItem = createAudioItem(audioFile, startTimeUs, null, effects)
+            val audioItem = createAudioItem(audioFile, startTimeUs, null, Effects(buildAudioProcessors(), emptyList()))
             audioItems.add(audioItem)
             return audioItems
         }
@@ -308,7 +261,7 @@ class AudioSequenceBuilder(
                 if (audioEndTimeUs != null) loopEndUs else null
             }
 
-            val audioItem = createAudioItem(audioFile, loopStartUs, endPositionUs, effects)
+            val audioItem = createAudioItem(audioFile, loopStartUs, endPositionUs, Effects(buildAudioProcessors(), emptyList()))
             audioItems.add(audioItem)
             remainingDurationUs -= loopAudioDurationUs
             isFirstLoop = false
@@ -325,8 +278,7 @@ class AudioSequenceBuilder(
         audioFile: File,
         sourceEndUs: Long,
         effectiveAudioDurationUs: Long,
-        targetDurationUs: Long,
-        effects: Effects
+        targetDurationUs: Long
     ): List<EditedMediaItem> {
         val endPositionUs = if (effectiveAudioDurationUs > targetDurationUs && targetDurationUs > 0) {
             Log.d(RENDER_TAG, "Trimming audio to ${targetDurationUs / 1000} ms (no loop)")
             )
             null
         }
-        return listOf(createAudioItem(audioFile, startTimeUs, endPositionUs, effects))
+        return listOf(createAudioItem(audioFile, startTimeUs, endPositionUs, Effects(buildAudioProcessors(), emptyList())))
     }
 
     /**
@@ -376,86 +328,4 @@ class AudioSequenceBuilder(
             .build()
     }
 
-    /**
-     * Creates a silent audio EditedMediaItem of the specified duration.
-     *
-     * Media3 parallel sequences always start at time 0, so we use silence
-     * to offset audio to the correct composition position.
-     */
-    private fun createSilentAudioItem(durationUs: Long, effects: Effects): EditedMediaItem? {
-        if (durationUs <= 0) return null
-
-        val silentFile = generateSilentWavFile(durationUs)
-        if (silentFile == null) {
-            Log.e(RENDER_TAG, "Failed to create silent audio item")
-            return null
-        }
-
-        val mediaItem = MediaItem.Builder().setUri(Uri.fromFile(silentFile)).build()
-        return EditedMediaItem.Builder(mediaItem)
-            .setRemoveVideo(true)
-            .setEffects(effects)
-            .build()
-    }
-
-    /**
-     * Generates a temporary WAV file containing silence of the specified duration.
-     *
-     * Creates a valid PCM WAV file with stereo 44100Hz 16-bit silence.
-     */
-    private fun generateSilentWavFile(durationUs: Long): File? {
-        try {
-            val sampleRate = 44100
-            val channels = 2
-            val bitsPerSample = 16
-            val bytesPerSample = bitsPerSample / 8
-            val numSamples = (sampleRate * durationUs / 1_000_000.0).toInt()
-            val dataSize = numSamples * channels * bytesPerSample
-            val fileSize = 36 + dataSize
-
-            val file = File.createTempFile("silence_", ".wav")
-            file.deleteOnExit()
-
-            file.outputStream().use { out ->
-                // RIFF header
-                out.write("RIFF".toByteArray(Charsets.US_ASCII))
-                out.write(toLittleEndian(fileSize, 4))
-                out.write("WAVE".toByteArray(Charsets.US_ASCII))
-
-                // fmt subchunk
-                out.write("fmt ".toByteArray(Charsets.US_ASCII))
-                out.write(toLittleEndian(16, 4)) // Subchunk1Size (PCM)
-                out.write(toLittleEndian(1, 2)) // AudioFormat (PCM = 1)
-                out.write(toLittleEndian(channels, 2))
-                out.write(toLittleEndian(sampleRate, 4))
-                out.write(toLittleEndian(sampleRate * channels * bytesPerSample, 4))
-                out.write(toLittleEndian(channels * bytesPerSample, 2))
-                out.write(toLittleEndian(bitsPerSample, 2))
-
-                // data subchunk
-                out.write("data".toByteArray(Charsets.US_ASCII))
-                out.write(toLittleEndian(dataSize, 4))
-
-                // Write silence (all zeros)
-                val buffer = ByteArray(8192)
-                var remaining = dataSize
-                while (remaining > 0) {
-                    val toWrite = minOf(remaining, buffer.size)
-                    out.write(buffer, 0, toWrite)
-                    remaining -= toWrite
-                }
-            }
-
-            Log.d(RENDER_TAG, "Generated ${durationUs / 1000}ms silent WAV: ${file.absolutePath}")
-            return file
-        } catch (e: Exception) {
-            Log.e(RENDER_TAG, "Failed to generate silent WAV: ${e.message}")
-            return null
-        }
-    }
-
-    /** Converts an integer to little-endian byte array. */
-    private fun toLittleEndian(value: Int, numBytes: Int): ByteArray {
-        return ByteArray(numBytes) { i -> ((value shr (8 * i)) and 0xFF).toByte() }
-    }
 }
diff --git a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/CompositionBuilder.kt b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/CompositionBuilder.kt
index a83ae3b..32b02a6 100644
--- a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/CompositionBuilder.kt
+++ b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/CompositionBuilder.kt
@@ -2,14 +2,27 @@ package ch.waio.pro_video_editor.src.features.render.helpers
 
 import RENDER_TAG
 import android.content.Context
+import android.graphics.Bitmap
+import android.graphics.Color
+import android.net.Uri
 import androidx.media3.common.Effect
+import androidx.media3.common.MediaItem
 import androidx.media3.common.audio.AudioProcessor
+import androidx.media3.transformer.EditedMediaItem
 import androidx.media3.common.util.UnstableApi
+import androidx.media3.effect.AlphaScale
+import androidx.media3.effect.Presentation
 import androidx.media3.transformer.Composition
 import androidx.media3.transformer.EditedMediaItemSequence
-import ch.waio.pro_video_editor.src.features.render.models.AudioTrackConfig
+import androidx.media3.transformer.Effects
 import ch.waio.pro_video_editor.src.features.render.models.RenderConfig
+import ch.waio.pro_video_editor.src.features.render.models.VideoClip
+import ch.waio.pro_video_editor.src.features.render.utils.getRotatedVideoDimensions
 import ch.waio.pro_video_editor.src.shared.logging.PluginLog as Log
+import java.io.File
+import kotlin.math.max
+import kotlin.math.min
+import androidx.core.graphics.createBitmap
 
 /**
  * Main builder class for creating Media3 Compositions from render configurations.
@@ -21,8 +34,8 @@ import ch.waio.pro_video_editor.src.shared.logging.PluginLog as Log
  */
 @UnstableApi
 class CompositionBuilder(
-    private val context: Context,
-    private val config: RenderConfig
+    private val config: RenderConfig,
+    private val context: Context
 ) {
     private var videoEffects: List<Effect> = emptyList()
@@ -58,68 +71,183 @@ class CompositionBuilder(
         Log.d(RENDER_TAG, "Audio enabled: ${config.enableAudio}")
         Log.d(RENDER_TAG, "Audio tracks: ${config.audioTracks.size}")
 
+        // Default render dimensions if not provided
+        var renderWidth = config.renderWidth
+        var renderHeight = config.renderHeight
         val rotationDegrees = (4 - (config.rotateTurns ?: 0)) * 90f
 
+        // If any clip has x, y, width, height, segmentTimeUs, opacity or zIndex, we use multiple sequences.
+        val needsMultipleSequences = config.videoClips.any {
+            it.x != null || it.y != null || it.width != null || it.height != null ||
+                it.segmentTimeUs != null || it.opacity != null || (it.zIndex ?: 0) != 0
+        }
+
+        val hasImageLayers = config.imageLayers.isNotEmpty()
+        if ((renderWidth == null || renderHeight == null) && (needsMultipleSequences || hasImageLayers)) {
+            // Use the first clip for dimensions.
+            // Prefer explicitly set width/height if available, otherwise fallback to file dimensions.
+            val backgroundClip = config.videoClips.first()
+            if (backgroundClip.width != null && backgroundClip.height != null) {
+                renderWidth = backgroundClip.width.toInt()
+                renderHeight = backgroundClip.height.toInt()
+                Log.d(RENDER_TAG, "Defaulting render dimensions to first clip's size: ${renderWidth}x${renderHeight}")
+            } else {
+                val (w, h, _) = getRotatedVideoDimensions(File(backgroundClip.inputPath), rotationDegrees)
+                renderWidth = w
+                renderHeight = h
+                Log.d(RENDER_TAG, "Defaulting render dimensions to first clip's file: ${renderWidth}x${renderHeight}")
+            }
+        }
+
+        Log.d(RENDER_TAG, "Render dimensions: ${renderWidth}x${renderHeight}")
+
         val hasCustomAudio = config.audioTracks.isNotEmpty()
 
-        // Build video sequence
-        val videoBuilder = VideoSequenceBuilder(config.videoClips)
-            .setVideoEffects(videoEffects)
-            .setAudioEffects(audioEffects)
-            .setRotation(rotationDegrees)
-            .setFlip(config.flipX, config.flipY)
-            .setScale(config.scaleX, config.scaleY)
-            .setCrop(config.cropWidth, config.cropHeight, config.cropX, config.cropY)
-            .setTimedImageLayers(config.imageLayers.map { imageLayer ->
-                VideoSequenceBuilder.ImageLayerConfig(
-                    imageBytes = imageLayer.imageData,
-                    scaleX = config.scaleX,
-                    scaleY = config.scaleY,
-                    withCropping = config.imageBytesWithCropping,
-                    startUs = imageLayer.startUs,
-                    endUs = imageLayer.endUs,
-                    x = imageLayer.x,
-                    y = imageLayer.y,
-                    width = imageLayer.width,
-                    height = imageLayer.height,
-                    animations = imageLayer.animations
-                )
-            })
+        // Detect if audio normalization is needed (check both video and custom audio)
+        // This MUST be done before building sequences so they all use consistent channel counts.
+        // We now rely on pre-transcoding for multi-channel video clips,
+        // so videoNeedsNormalization will usually be false here.
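+        // (For example, a 5.1 clip mixed with a stereo music track still yields
+        // needsNormalization = true below, so every sequence downmixes to stereo
+        // before Media3 mixes them together.)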
+        val videoMetadataBuilder = VideoSequenceBuilder(config.videoClips)
            .setEnableAudio(config.enableAudio)
-            .setGlobalTrim(config.startUs, config.endUs)
-            .setHasCustomAudio(hasCustomAudio)
+        val videoNeedsNormalization = videoMetadataBuilder.detectAudioNormalizationNeeded()
+        val needsNormalization = videoNeedsNormalization || hasCustomAudio
 
-        // Detect if audio normalization is needed (check both video and custom audio)
-        val needsNormalization = videoBuilder.detectAudioNormalizationNeeded() || hasCustomAudio
-        videoBuilder.setAudioNormalization(needsNormalization)
+        // 1. Calculate total duration of the entire composition.
+        // Even in complex compositions, some clips might not have segmentTimeUs,
+        // in which case they should follow the previous clip in the input list.
+        var totalDurationUs = 0L
+        var runningSequentialTimeUs = 0L
+
+        // Map to store calculated start/end times for each clip by its identity (original index)
+        val clipTimings = mutableMapOf<Int, Pair<Long, Long>>()
 
-        // Video keeps its audio - Media3 will mix it natively with custom audio sequence
-        videoBuilder.setForceRemoveAudio(false)
+        for ((index, clip) in config.videoClips.withIndex()) {
+            val clipDurationUs = when {
+                clip.endUs != null -> clip.endUs - (clip.startUs ?: 0L)
+                else -> MediaInfoExtractor.getVideoDuration(clip.inputPath) - (clip.startUs ?: 0L)
+            }
 
-        // Build video sequence (with audio intact)
-        val videoSequence = videoBuilder.build()
+            val clipStartInComposition = if (needsMultipleSequences) {
+                clip.segmentTimeUs ?: runningSequentialTimeUs
+            } else {
+                runningSequentialTimeUs
+            }
+
+            val clipEndInComposition = clipStartInComposition + clipDurationUs
+            clipTimings[index] = Pair(clipStartInComposition, clipEndInComposition)
+
+            totalDurationUs = max(totalDurationUs, clipEndInComposition)
+
+            // Sequential time only increments if we are NOT using explicit segment timing,
+            // or if we are in sequential mode.
+            if (!needsMultipleSequences || clip.segmentTimeUs == null) {
+                runningSequentialTimeUs = clipEndInComposition
+            }
+        }
+
+        // 2. Calculate global timing
+        val globalStartUs = config.startUs ?: 0L
+        val globalEndUs = config.endUs ?: totalDurationUs
+        val globalDurationUs = globalEndUs - globalStartUs
+
+        Log.d(RENDER_TAG, "Total composition duration: ${totalDurationUs / 1000}ms")
+        Log.d(RENDER_TAG, "Global trim: ${globalStartUs / 1000}ms to ${globalEndUs / 1000}ms (duration: ${globalDurationUs / 1000}ms)")
 
-        // Prepare sequences list
         val sequences = mutableListOf<EditedMediaItemSequence>()
-        sequences.add(videoSequence)
-        Log.d(
-            RENDER_TAG,
-            "Created video EditedMediaItemSequence with ${config.videoClips.size} items"
-        )
+
+        if (needsMultipleSequences) {
+            Log.d(RENDER_TAG, "Complex composition detected, building multiple video sequences")
+
+            // 3. Sort clips for correct layering based on zIndex and original order.
+            // Rules:
+            // 1. Higher zIndex on top.
+            // 2. Default zIndex is 0.
+            // 3. If zIndex is same, latter segment in input list is on top.
+            //
+            // In Media3, the first sequence in the list is the BOTTOM-MOST layer (Index 0).
+            // By using a stable ascending sort, we satisfy the rules.
+            val indexedClips = config.videoClips.mapIndexed { index, clip -> index to clip }
+            val sortedIndexedClips = indexedClips.sortedBy { it.second.zIndex ?: 0 }
+
+            Log.d(RENDER_TAG, "Sorted clips for composition (bottom to top):")
+            for ((index, clip) in sortedIndexedClips) {
+                Log.d(RENDER_TAG, "  Sequence ${index + 1}: path=${clip.inputPath}, zIndex=${clip.zIndex ?: 0}")
+
+                val (clipStartUs, clipEndUs) = clipTimings[index]!!
+
+                // Check if clip overlaps with global trim range
+                if (clipEndUs <= globalStartUs || clipStartUs >= globalEndUs) {
+                    Log.d(RENDER_TAG, "Skipping clip outside global trim: ${clip.inputPath}")
+                    continue
+                }
+
+                // Adjust clip boundaries and calculate leading gap
+                val adjustedStartInComposition = max(clipStartUs, globalStartUs)
+                val adjustedEndInComposition = min(clipEndUs, globalEndUs)
+                val leadingGapUs = adjustedStartInComposition - globalStartUs
+
+                // Adjust trim relative to source
+                var clipTrimStartUs = clip.startUs ?: 0L
+                if (clipStartUs < globalStartUs) {
+                    clipTrimStartUs += (globalStartUs - clipStartUs)
+                }
+                val clipTrimEndUs = clipTrimStartUs + (adjustedEndInComposition - adjustedStartInComposition)
+
+                // Build sequence with pre-trimmed clip
+                val trimmedClip = clip.copy(startUs = clipTrimStartUs, endUs = clipTrimEndUs)
+                val videoBuilder = VideoSequenceBuilder(listOf(trimmedClip))
+                    .setVideoEffects(emptyList())
+                    .setAudioEffects(emptyList())
+                    .setRotation(rotationDegrees)
+                    .setFlip(config.flipX, config.flipY)
+                    .setScale(config.scaleX, config.scaleY)
+                    .setCrop(config.cropWidth, config.cropHeight, config.cropX, config.cropY)
+                    .setEnableAudio(config.enableAudio && (clip.volume ?: 1.0f) > 0 && !hasCustomAudio)
+                    .setHasCustomAudio(hasCustomAudio)
+                    .setForceRemoveAudio(false)
+                    .setRenderDimensions(renderWidth, renderHeight)
+                    // Ensure consistency across multiple sequences in complex compositions.
+                    .setAudioNormalization(needsNormalization)
+
+                val baseSequence = videoBuilder.build()
+                val sequenceBuilder = EditedMediaItemSequence.Builder(baseSequence.trackTypes)
+
+                // Prepend leading gap relative to globalStartUs
+                if (leadingGapUs > 0) {
+                    sequenceBuilder.addItem(createTransparentGapItem(leadingGapUs, renderWidth, renderHeight))
+                }
+
+                sequenceBuilder.addItems(baseSequence.editedMediaItems)
+
+                // Pad remaining duration to prevent frozen frames
+                val sequenceDurationUs = leadingGapUs + (adjustedEndInComposition - adjustedStartInComposition)
+                if (sequenceDurationUs < globalDurationUs) {
+                    sequenceBuilder.addItem(createTransparentGapItem(globalDurationUs - sequenceDurationUs, renderWidth, renderHeight))
+                }
+
+                sequences.add(sequenceBuilder.build())
+            }
+        } else {
+            // Build single optimized video sequence
+            val videoBuilder = createVideoBuilder(config.videoClips, rotationDegrees, hasCustomAudio)
+                .setVideoEffects(emptyList())
+                .setAudioEffects(emptyList())
+                .setRenderDimensions(renderWidth, renderHeight)
+                .setAudioNormalization(needsNormalization)
+            sequences.add(videoBuilder.build())
+        }
 
         // Add audio tracks as separate sequences - Media3 will mix all tracks natively
         if (hasCustomAudio) {
-            val totalVideoDuration = videoBuilder.calculateTotalDuration()
-
             for ((index, track) in config.audioTracks.withIndex()) {
                 Log.d(
                     RENDER_TAG,
                     "🎵 Adding audio track $index: path=${track.path}, volume=${track.volume}, loop=${track.loop}"
                 )
 
-                val audioSequence = AudioSequenceBuilder(track.path, totalVideoDuration)
+                val audioSequence = AudioSequenceBuilder(track.path, globalDurationUs)
                     .setVolume(track.volume)
-                    .setNormalization(needsNormalization)
+                    .setNormalization(false) // Custom tracks are usually stereo; allow native resampling
                     .setLoop(track.loop)
                     .setStartTime(track.audioStartUs)
                     .setAudioEndTime(track.audioEndUs)
@@ -135,9 +263,109 @@ class CompositionBuilder(
         }
 
         // Build final composition
-        val composition = Composition.Builder(sequences).build()
+        val compositionBuilder = Composition.Builder(sequences.toList())
+
+        // Add Global effects and Presentation effect.
+        // Moving effects to Composition level ensures they apply to the final combined video.
+        val combinedVideoEffects = mutableListOf<Effect>()
+        combinedVideoEffects.addAll(videoEffects)
+
+        // Apply Image Layers at the Composition level so they are truly global
+        if (config.imageLayers.isNotEmpty() && renderWidth != null && renderHeight != null) {
+            applyTimedImageLayers(
+                combinedVideoEffects,
+                config.imageLayers,
+                renderWidth,
+                renderHeight
+            )
+        }
+
+        if (renderWidth != null && renderHeight != null) {
+            combinedVideoEffects += Presentation.createForWidthAndHeight(
+                renderWidth,
+                renderHeight,
+                Presentation.LAYOUT_SCALE_TO_FIT
+            )
+            Log.d(RENDER_TAG, "Global Presentation effect applied: ${renderWidth}x${renderHeight}")
+        }
+
+        // Prepare global audio effects
+        val finalAudioEffects = mutableListOf<AudioProcessor>()
+        finalAudioEffects.addAll(audioEffects)
+
+        compositionBuilder.setEffects(Effects(finalAudioEffects, combinedVideoEffects))
+
+        val composition = compositionBuilder.build()
         Log.d(RENDER_TAG, "Composition created successfully with ${sequences.size} sequences")
 
         return composition
     }
-}
\ No newline at end of file
+
+    /**
+     * Cleans up temporary resources used during composition building.
+     */
+    fun release() {
+        // No per-render temp files to clean up; the cached transparent gap PNG is reused across renders.
+    }
+
+    private fun createTransparentGapItem(durationUs: Long, renderWidth: Int?, renderHeight: Int?): EditedMediaItem {
+        val gapFile = File(context.cacheDir, "pve_transparent_gap.png")
+        if (!gapFile.exists()) {
+            try {
+                val bitmap = createBitmap(1, 1)
+                bitmap.eraseColor(Color.TRANSPARENT)
+                gapFile.outputStream().use {
+                    bitmap.compress(Bitmap.CompressFormat.PNG, 100, it)
+                }
+                Log.d(RENDER_TAG, "Created transparent gap PNG at: ${gapFile.absolutePath}")
+            } catch (e: Exception) {
+                Log.e(RENDER_TAG, "Failed to create transparent gap PNG: ${e.message}")
+            }
+        }
+
+        val mediaItem = MediaItem.Builder()
+            .setUri(Uri.fromFile(gapFile))
+            .setImageDurationMs(maxOf(1, (durationUs + 999) / 1000))
+            .build()
+
+        val videoEffects = mutableListOf<Effect>()
+        videoEffects.add(AlphaScale(0f))
+
+        // Move the gap item off-screen to ensure it doesn't obscure anything even if transparency fails
+        if (renderWidth != null && renderHeight != null) {
+            videoEffects.add(VideoCompositionTransformation(
+                x = -100.0, // Off-screen
+                y = -100.0,
+                width = 1.0,
+                height = 1.0,
+                videoWidth = 1,
+                videoHeight = 1,
+                renderWidth = renderWidth,
+                renderHeight = renderHeight
+            ))
+        }
+
+        return EditedMediaItem.Builder(mediaItem)
+            .setFrameRate(30)
+            .setEffects(Effects(emptyList(), videoEffects))
+            .build()
+    }
+
+    private fun createVideoBuilder(
+        clips: List<VideoClip>,
+        rotationDegrees: Float,
+        hasCustomAudio: Boolean
+    ): VideoSequenceBuilder {
+        return VideoSequenceBuilder(clips)
+            .setVideoEffects(videoEffects)
+            .setAudioEffects(audioEffects)
+            .setRotation(rotationDegrees)
+            .setFlip(config.flipX, config.flipY)
+            .setScale(config.scaleX, config.scaleY)
+            .setCrop(config.cropWidth, config.cropHeight, config.cropX, config.cropY)
+            .setEnableAudio(config.enableAudio)
+            .setGlobalTrim(config.startUs, config.endUs)
+            .setHasCustomAudio(hasCustomAudio)
+            .setForceRemoveAudio(false)
+    }
+}
diff --git a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/VideoCompositionTransformation.kt b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/VideoCompositionTransformation.kt
new file mode 100644
index 0000000..b59bee4
--- /dev/null
+++ b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/VideoCompositionTransformation.kt
@@ -0,0 +1,136 @@
+package ch.waio.pro_video_editor.src.features.render.helpers
+
+import android.content.Context
+import android.opengl.GLES20
+import android.opengl.Matrix
+import androidx.media3.common.VideoFrameProcessingException
+import androidx.media3.common.util.GlProgram
+import androidx.media3.common.util.GlUtil
+import androidx.media3.common.util.Size
+import androidx.media3.common.util.UnstableApi
+import androidx.media3.effect.BaseGlShaderProgram
+import androidx.media3.effect.GlEffect
+import androidx.media3.effect.GlShaderProgram
+
+/**
+ * A GlEffect that handles positioning and scaling of a video segment
+ * within a larger render canvas.
+ *
+ * It converts pixel-based offsets and sizes from the Flutter side into
+ * the normalized OpenGL coordinates used by Media3 effects.
+ */
+@UnstableApi
+class VideoCompositionTransformation(
+    private val x: Double?,
+    private val y: Double?,
+    private val width: Double?,
+    private val height: Double?,
+    private val videoWidth: Int,
+    private val videoHeight: Int,
+    private val renderWidth: Int,
+    private val renderHeight: Int
+) : GlEffect {
+
+    override fun toGlShaderProgram(context: Context, useHdr: Boolean): GlShaderProgram {
+        return VideoCompositionShaderProgram(context, useHdr, this)
+    }
+
+    @UnstableApi
+    private class VideoCompositionShaderProgram(
+        context: Context,
+        useHdr: Boolean,
+        private val effect: VideoCompositionTransformation
+    ) : BaseGlShaderProgram(useHdr, /* texturePoolCapacity= */ 1) {
+
+        private val glProgram: GlProgram
+
+        companion object {
+            private const val VERTEX_SHADER_SOURCE =
+                "attribute vec4 aFramePosition;\n" +
+                "attribute vec4 aTexSamplingCoord;\n" +
+                "varying vec2 vTexSamplingCoord;\n" +
+                "uniform mat4 uTransformationMatrix;\n" +
+                "void main() {\n" +
+                "  gl_Position = uTransformationMatrix * aFramePosition;\n" +
+                "  vTexSamplingCoord = aTexSamplingCoord.xy;\n" +
+                "}"
+
+            private const val FRAGMENT_SHADER_SOURCE =
+                "precision mediump float;\n" +
+                "uniform sampler2D uTexSampler;\n" +
+                "varying vec2 vTexSamplingCoord;\n" +
+                "void main() {\n" +
+                "  gl_FragColor = texture2D(uTexSampler, vTexSamplingCoord);\n" +
+                "}"
+        }
+
+        init {
+            try {
+                glProgram = GlProgram(VERTEX_SHADER_SOURCE, FRAGMENT_SHADER_SOURCE)
+            } catch (e: Exception) {
+                throw VideoFrameProcessingException(e)
+            }
+        }
+
+        override fun configure(inputWidth: Int, inputHeight: Int): Size {
+            return Size(effect.renderWidth, effect.renderHeight)
+        }
+
+        override fun drawFrame(inputTexId: Int, presentationTimeUs: Long) {
+            try {
+                glProgram.use()
+
+                // Clear the target framebuffer to transparent before drawing the segment.
+                // This ensures that segments that don't cover the full canvas don't show garbage.
+                GLES20.glClearColor(0f, 0f, 0f, 0f)
+                GLES20.glClear(GLES20.GL_COLOR_BUFFER_BIT)
+
+                // Enable alpha blending to support transparent layers and overlays.
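+                // (SRC_ALPHA / ONE_MINUS_SRC_ALPHA is the standard non-premultiplied
+                // "over" blend: out = src.a * src.rgb + (1 - src.a) * dst.rgb.)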
+                GLES20.glEnable(GLES20.GL_BLEND)
+                GLES20.glBlendFunc(GLES20.GL_SRC_ALPHA, GLES20.GL_ONE_MINUS_SRC_ALPHA)
+
+                val glMatrix = FloatArray(16)
+                Matrix.setIdentityM(glMatrix, 0)
+
+                val targetWidth = (effect.width ?: effect.videoWidth.toDouble()).toFloat()
+                val targetHeight = (effect.height ?: effect.videoHeight.toDouble()).toFloat()
+
+                // sx and sy are half-widths in NDC (relative to a 2.0 wide NDC space)
+                val sx = if (effect.renderWidth > 0) targetWidth / effect.renderWidth else 1.0f
+                val sy = if (effect.renderHeight > 0) targetHeight / effect.renderHeight else 1.0f
+
+                // Convert pixel (x, y) to NDC top-left
+                val leftNDC = if (effect.renderWidth > 0) (2f * (effect.x ?: 0.0).toFloat() / effect.renderWidth) - 1f else -1.0f
+                val topNDC = if (effect.renderHeight > 0) 1f - (2f * (effect.y ?: 0.0).toFloat() / effect.renderHeight) else 1.0f
+
+                // Target center in NDC for a quad that is 2x2 centered at 0,0
+                val centerX = leftNDC + sx
+                val centerY = topNDC - sy
+
+                Matrix.translateM(glMatrix, 0, centerX, centerY, 0f)
+                Matrix.scaleM(glMatrix, 0, sx, sy, 1f)
+
+                glProgram.setFloatsUniform("uTransformationMatrix", glMatrix)
+                glProgram.setSamplerTexIdUniform("uTexSampler", inputTexId, 0)
+
+                // Set attribute buffers with robust size detection
+                val vertexData = GlUtil.getNormalizedCoordinateBounds()
+                val vertexSize = if (vertexData.size == 8) 2 else 4
+                glProgram.setBufferAttribute("aFramePosition", vertexData, vertexSize)
+
+                val texData = GlUtil.getTextureCoordinateBounds()
+                val texSize = if (texData.size == 8) 2 else 4
+                glProgram.setBufferAttribute("aTexSamplingCoord", texData, texSize)
+
+                glProgram.bindAttributesAndUniforms()
+
+                GLES20.glDrawArrays(GLES20.GL_TRIANGLE_STRIP, 0, 4)
+
+                GLES20.glDisable(GLES20.GL_BLEND)
+                GlUtil.checkGlError()
+            } catch (e: Exception) {
+                throw VideoFrameProcessingException(e, presentationTimeUs)
+            }
+        }
+    }
+}
diff --git a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/VideoSequenceBuilder.kt b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/VideoSequenceBuilder.kt
index 8193e0c..15305f9 100644
--- a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/VideoSequenceBuilder.kt
+++ b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/VideoSequenceBuilder.kt
@@ -3,6 +3,7 @@ package ch.waio.pro_video_editor.src.features.render.helpers
 import RENDER_TAG
 import android.net.Uri
 import applyScale
+import applyRotation
 import androidx.media3.common.C
 import androidx.media3.common.Effect
 import androidx.media3.common.MediaItem
@@ -35,7 +36,6 @@ class VideoSequenceBuilder(
     private var flipX: Boolean = false
     private var flipY: Boolean = false
     private var cropConfig: CropConfig? = null
-    private var timedImageLayers: List<ImageLayerConfig> = emptyList()
     private var enableAudio: Boolean = true
     private var needsAudioNormalization: Boolean = false
     private var forceRemoveAudio: Boolean = false
@@ -44,6 +44,8 @@ class VideoSequenceBuilder(
     private var hasCustomAudio: Boolean = false
     private var scaleX: Float? = null
     private var scaleY: Float? = null
+    private var renderWidth: Int? = null
+    private var renderHeight: Int? = null
 
     data class CropConfig(
         val width: Int?,
@@ -52,19 +54,14 @@ class VideoSequenceBuilder(
         val y: Int?
     )
 
-    data class ImageLayerConfig(
-        val imageBytes: ByteArray?,
-        val scaleX: Float?,
-        val scaleY: Float?,
-        val withCropping: Boolean = false,
-        val startUs: Long = 0,
-        val endUs: Long = -1,
-        val x: Int? = null,
-        val y: Int? = null,
-        val width: Double? = null,
-        val height: Double? = null,
-        val animations: List = emptyList()
-    )
+    /**
+     * Sets target render dimensions for composition-based positioning.
+     */
+    fun setRenderDimensions(width: Int?, height: Int?): VideoSequenceBuilder {
+        this.renderWidth = width
+        this.renderHeight = height
+        return this
+    }
 
     /**
      * Sets the video effects to apply to all clips.
@@ -116,14 +113,6 @@ class VideoSequenceBuilder(
         return this
     }
 
-    /**
-     * Sets time-based image layer overlays configuration.
-     */
-    fun setTimedImageLayers(layers: List<ImageLayerConfig>): VideoSequenceBuilder {
-        this.timedImageLayers = layers
-        return this
-    }
-
     /**
      * Enables or disables audio in the output.
      */
@@ -179,10 +168,17 @@ class VideoSequenceBuilder(
     /**
     * Detects if audio normalization is needed across video clips.
      *
-     * @return true if clips have different audio channel counts
+     * Only considers clips that have audio enabled (not muted).
+     *
+     * @return true if any clip has non-stereo audio (needs downmixing)
      */
     fun detectAudioNormalizationNeeded(): Boolean {
-        if (!enableAudio || videoClips.size <= 1) {
+        if (!enableAudio) {
             return false
         }
 
@@ -190,18 +186,15 @@ class VideoSequenceBuilder(
             MediaInfoExtractor.getAudioChannelCount(clip.inputPath)
         }
 
-        val needsNormalization = audioChannelCounts.isNotEmpty() &&
-            audioChannelCounts.toSet().size > 1
+        // Normalize if any clip is NOT Stereo (2 channels).
+        // This includes Mono (1 channel) and Multi-channel (5.1/7.1).
+        // Forcing Stereo consistency prevents reconfiguration errors in Media3 AudioGraph.
+        val needsNormalization = audioChannelCounts.any { it != 2 }
 
         if (needsNormalization) {
             Log.d(
                 RENDER_TAG,
-                "Audio normalization needed - detected different channel counts: $audioChannelCounts"
-            )
-        } else if (audioChannelCounts.isNotEmpty()) {
-            Log.d(
-                RENDER_TAG,
-                "Audio normalization NOT needed - all videos have same channel count: ${audioChannelCounts.firstOrNull()}"
+                "Audio normalization needed - non-stereo audio detected: $audioChannelCounts"
             )
         }
 
@@ -225,9 +218,8 @@ class VideoSequenceBuilder(
         var totalDurationUs = 0L
         trimmedClips.forEach { clip ->
             val clipDurationUs = when {
-                clip.endUs != null && clip.startUs != null -> clip.endUs - clip.startUs
-                clip.endUs != null -> clip.endUs
-                else -> MediaInfoExtractor.getVideoDuration(clip.inputPath)
+                clip.endUs != null -> clip.endUs - (clip.startUs ?: 0L)
+                else -> MediaInfoExtractor.getVideoDuration(clip.inputPath) - (clip.startUs ?: 0L)
             }
             totalDurationUs += clipDurationUs
         }
@@ -248,17 +240,17 @@ class VideoSequenceBuilder(
         val trimmedClips = applyGlobalTrim(videoClips)
         Log.d(RENDER_TAG, "After global trim: ${trimmedClips.size} clips (was ${videoClips.size})")
 
-        // Prepare normalized audio effects with channel mixing if needed
-        val normalizedAudioEffects = if (needsAudioNormalization) {
-            Log.d(RENDER_TAG, "Adding ChannelMixingAudioProcessor to normalize audio to stereo")
-            buildChannelNormalizationEffects()
-        } else {
-            audioEffects.toList()
-        }
-
         // Build EditedMediaItems for each clip
         val editedMediaItems = trimmedClips.mapIndexed { index, clip ->
+            // Audio normalization is now handled primarily via pre-transcoding.
+            // We keep the processor for edge cases where pre-transcoding was skipped.
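+            // (createStandardStereoMixer() is effectively a passthrough for clips
+            // that are already stereo, via its 2 -> 2 constant-gain matrix.)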
+            val itemAudioProcessors = mutableListOf<AudioProcessor>()
+            if (needsAudioNormalization) {
+                itemAudioProcessors.add(AudioMixingUtils.createStandardStereoMixer())
+            }
+            itemAudioProcessors.addAll(audioEffects)
+
+            buildEditedMediaItem(index, clip, itemAudioProcessors)
         }
 
         Log.d(RENDER_TAG, "Total EditedMediaItems created: ${editedMediaItems.size}")
@@ -281,8 +273,13 @@ class VideoSequenceBuilder(
         // Determine track types for the sequence
         val trackTypes = mutableSetOf<@C.TrackType Int>(C.TRACK_TYPE_VIDEO)
-        if (enableAudio) {
+
+        // ONLY add audio track type if at least one item provides audio
+        if (enableAudio && finalVideoItems.any { !it.removeAudio }) {
             trackTypes.add(C.TRACK_TYPE_AUDIO)
+            Log.d(RENDER_TAG, "Sequence will include AUDIO track")
+        } else {
+            Log.d(RENDER_TAG, "Sequence will NOT include AUDIO track (muted or disabled)")
         }
 
         return EditedMediaItemSequence.Builder(trackTypes)
@@ -292,93 +289,13 @@ class VideoSequenceBuilder(
     }
 
     /**
-     * Builds channel normalization effects (channel mixer + audio processors).
-     *
-     * Uses boosted ITU-R BS.775 coefficients for multi-channel downmixing.
-     *
-     * The standard ITU-R BS.775 coefficients (1.0, 0.707, 0.707) cause volume loss
-     * because the energy distributed across multiple channels doesn't fully translate
-     * to stereo. We apply a boost factor of ~1.4 (sqrt(2)) to compensate.
-     *
-     * This ensures that surround content maintains similar perceived loudness
-     * when mixed with stereo custom audio tracks.
+     * Returns true when the given path points to a still image, based on its file extension.
      */
-    private fun buildChannelNormalizationEffects(): List<AudioProcessor> {
-        val channelMixer = ChannelMixingAudioProcessor()
-
-        // Boost factor to compensate for energy loss during downmixing
-        // sqrt(2) ≈ 1.414 compensates for the typical ~70% volume loss
-        val boost = 1.4f
-
-        // 7.1 Surround (8 channels) to Stereo (2 channels)
-        // Channel order: FL, FR, FC, LFE, BL, BR, SL, SR
-        // Boosted coefficients to maintain loudness
-        val eightToTwo = floatArrayOf(
-            1.0f * boost,
-            0.0f,
-            0.707f * boost,
-            0.0f,
-            0.707f * boost,
-            0.0f,
-            0.707f * boost,
-            0.0f, // Left output
-            0.0f,
-            1.0f * boost,
-            0.707f * boost,
-            0.0f,
-            0.0f,
-            0.707f * boost,
-            0.0f,
-            0.707f * boost // Right output
-        )
-        channelMixer.putChannelMixingMatrix(
-            ChannelMixingMatrix(8, 2, eightToTwo)
-        )
-
-        // 5.1 Surround (6 channels) to Stereo (2 channels)
-        // Channel order: FL, FR, FC, LFE, BL, BR
-        // Boosted ITU-R BS.775: L' = (L + 0.707*C + 0.707*Ls) * boost
-        val sixToTwo = floatArrayOf(
-            1.0f * boost, 0.0f, 0.707f * boost, 0.0f, 0.707f * boost, 0.0f, // Left output
-            0.0f, 1.0f * boost, 0.707f * boost, 0.0f, 0.0f, 0.707f * boost // Right output
-        )
-        channelMixer.putChannelMixingMatrix(
-            ChannelMixingMatrix(6, 2, sixToTwo)
-        )
-
-        // Quad (4 channels) to Stereo (2 channels)
-        // Channel order: FL, FR, BL, BR
-        // Slightly lower boost for quad (less energy distributed)
-        val boostQuad = 1.2f
-        val fourToTwo = floatArrayOf(
-            1.0f * boostQuad, 0.0f, 0.707f * boostQuad, 0.0f, // Left output
-            0.0f, 1.0f * boostQuad, 0.0f, 0.707f * boostQuad // Right output
-        )
-        channelMixer.putChannelMixingMatrix(
-            ChannelMixingMatrix(4, 2, fourToTwo)
-        )
-
-        // Stereo (2 channels) to Stereo (2 channels) - passthrough (no boost needed)
-        channelMixer.putChannelMixingMatrix(
-            ChannelMixingMatrix.createForConstantGain(2, 2)
-        )
-
-        // Mono (1 channel) to Stereo (2 channels)
-        channelMixer.putChannelMixingMatrix(
-            ChannelMixingMatrix.createForConstantGain(1, 2)
-        )
-
-        Log.d(
-            RENDER_TAG,
-            "Channel normalization configured with boosted coefficients for loudness preservation"
-        )
-
-        return mutableListOf(channelMixer).apply { addAll(audioEffects) }
+    private fun isImageFile(path: String): Boolean {
+        val extension = path.substringAfterLast('.', "").lowercase()
+        return extension in listOf("jpg", "jpeg", "png", "webp", "heic", "heif")
     }
 
     /**
     * Builds an EditedMediaItem for a single video clip with all effects.
     */
     private fun buildEditedMediaItem(
         index: Int,
         clip: VideoClip,
@@ -389,27 +306,44 @@ class VideoSequenceBuilder(
 
         val inputFile = File(clip.inputPath)
         if (!inputFile.exists()) {
             Log.e(RENDER_TAG, "ERROR: Video file does not exist: ${clip.inputPath}")
-        } else {
-            Log.d(RENDER_TAG, "Video file exists, size: ${inputFile.length()} bytes")
         }
 
         // Build MediaItem with optional trimming
         val mediaItemBuilder = MediaItem.Builder().setUri(Uri.fromFile(inputFile))
 
+        val isImage = isImageFile(clip.inputPath)
+        if (isImage) {
+            val durationUs = when {
+                clip.endUs != null -> clip.endUs - (clip.startUs ?: 0L)
+                else -> MediaInfoExtractor.getVideoDuration(clip.inputPath) - (clip.startUs ?: 0L)
+            }
+            mediaItemBuilder.setImageDurationMs(maxOf(1, durationUs / 1000))
+
+            // Map common extensions to MIME types for Transformer
+            val extension = clip.inputPath.substringAfterLast('.', "").lowercase()
+            val mimeType = when (extension) {
+                "png" -> "image/png"
+                "webp" -> "image/webp"
+                "heic", "heif" -> "image/heif"
+                else -> "image/jpeg"
+            }
+            mediaItemBuilder.setMimeType(mimeType)
+        }
+
         if (clip.startUs != null || clip.endUs != null) {
             val startMs = (clip.startUs ?: 0L) / 1000
-            val endMs = clip.endUs?.div(1000) ?: C.TIME_END_OF_SOURCE
-            val expectedDurationMs = if (clip.endUs != null && clip.startUs != null) {
-                (clip.endUs - clip.startUs) / 1000
-            } else if (clip.endUs != null) {
+
+            // Explicitly use C.TIME_END_OF_SOURCE only if endUs is null.
+            // If it's provided, ensure it's not accidentally set to Long.MIN_VALUE via overflow/underflow.
+            val endMs = if (clip.endUs != null) {
                 clip.endUs / 1000
             } else {
-                -1L
+                C.TIME_END_OF_SOURCE
            }
 
             Log.d(
                 RENDER_TAG,
-                "Applying trim to clip ${clip.inputPath}: start=$startMs ms, end=$endMs ms, expectedDuration=$expectedDurationMs ms"
+                "Applying trim to clip ${clip.inputPath}: start=$startMs ms, end=$endMs ms"
             )
 
             val clippingConfig = MediaItem.ClippingConfiguration.Builder()
@@ -433,30 +367,16 @@ class VideoSequenceBuilder(
                 rotationDegrees
             )
 
+        // Apply rotation early so subsequent effects (Crop, Composition) see correctly oriented frames
+        applyRotation(clipVideoEffects, videoRotation.toFloat())
+
         // Adjust dimensions based on rotation
         val isRotated90Deg = videoRotation == 90 || videoRotation == 270
 
         // If crop is applied, update dimensions for AFTER crop scenario
-        val croppedWidth: Int?
-        val croppedHeight: Int?
         val crop = cropConfig
         if (crop != null) {
-            croppedWidth = if (isRotated90Deg) crop.height else crop.width
-            croppedHeight = if (isRotated90Deg) crop.width else crop.height
-        } else {
-            croppedWidth = null
-            croppedHeight = null
-        }
-
-        // Apply timed image layers BEFORE crop if withCropping is enabled
-        // This makes the images get cropped together with the video
-        val hasWithCropping = timedImageLayers.any { it.withCropping }
-        if (hasWithCropping && timedImageLayers.isNotEmpty()) {
-            applyTimedImageLayers(clipVideoEffects, timedImageLayers, videoWidth, videoHeight)
-        }
-
-        // Apply crop if configured
-        cropConfig?.let { crop ->
+            // Apply crop if configured
             applyCrop(
                 clipVideoEffects,
                 inputFile,
@@ -470,20 +390,34 @@ class VideoSequenceBuilder(
             )
 
             // Update dimensions after crop for image layers applied AFTER crop
+            val croppedWidth: Int? = if (isRotated90Deg) crop.height else crop.width
+            val croppedHeight: Int? = if (isRotated90Deg) crop.width else crop.height
             if (croppedWidth != null) videoWidth = croppedWidth
             if (croppedHeight != null) videoHeight = croppedHeight
         }
 
-        // Apply timed image layers AFTER crop if withCropping is disabled (default)
-        // This makes the images stretch to the final cropped size
-        if (!hasWithCropping && timedImageLayers.isNotEmpty()) {
-            applyTimedImageLayers(clipVideoEffects, timedImageLayers, videoWidth, videoHeight)
+        // Apply composition transformation if render dimensions are set.
+        // This ensures consistent canvas sizing.
+        if (renderWidth != null && renderHeight != null) {
+            clipVideoEffects += VideoCompositionTransformation(
+                x = clip.x,
+                y = clip.y,
+                width = clip.width,
+                height = clip.height,
+                videoWidth = videoWidth,
+                videoHeight = videoHeight,
+                renderWidth = renderWidth!!,
+                renderHeight = renderHeight!!
+            )
         }
 
         // Apply scale AFTER overlay and crop to match the iOS/macOS pipeline.
         // This prevents the overlay from being distorted by a pre-applied scale.
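+        // (Final canvas sizing is handled by the composition-level Presentation
+        // effect added in CompositionBuilder, so per-clip effects end here.)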
applyScale(clipVideoEffects, scaleX, scaleY) + // Apply opacity + applyOpacity(clipVideoEffects, clip.opacity) + // Per-clip volume control: // - Without custom audio: VolumeAudioProcessor per clip works (single sequence) // - With custom audio: AudioProcessors don't work with parallel sequences, @@ -521,6 +455,11 @@ class VideoSequenceBuilder( return EditedMediaItem.Builder(mediaItem) .setEffects(effects) .setRemoveAudio(shouldRemoveAudio) + .apply { + if (isImage) { + setFrameRate(30) + } + } .build() } @@ -599,7 +538,14 @@ class VideoSequenceBuilder( inputPath = clip.inputPath, startUs = newStartInSource, endUs = newEndInSource, - volume = clip.volume + volume = clip.volume, + x = clip.x, + y = clip.y, + width = clip.width, + height = clip.height, + zIndex = clip.zIndex, + opacity = clip.opacity, + segmentTimeUs = clip.segmentTimeUs ) ) val trimmedDuration = newEndInSource - newStartInSource diff --git a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/VideoTranscoder.kt b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/VideoTranscoder.kt index 84d9a06..b5a2194 100644 --- a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/VideoTranscoder.kt +++ b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/VideoTranscoder.kt @@ -15,6 +15,7 @@ import androidx.media3.transformer.ExportException import androidx.media3.transformer.ExportResult import androidx.media3.transformer.Transformer import ch.waio.pro_video_editor.src.shared.logging.PluginLog as Log +import androidx.media3.transformer.Effects import java.io.File import java.util.concurrent.CountDownLatch import java.util.concurrent.atomic.AtomicReference @@ -44,19 +45,21 @@ object VideoTranscoder { } /** - * Checks if a video needs transcoding for effect compatibility. + * Checks if a video needs transcoding for compatibility. * * @param videoPath Path to the video file * @return True if transcoding is needed */ fun needsTranscoding(videoPath: String): Boolean { val formatInfo = MediaInfoExtractor.getVideoFormatInfo(videoPath) - val needsTranscode = formatInfo.needsTranscodingForEffects() + val audioChannels = MediaInfoExtractor.getAudioChannelCount(videoPath) ?: 2 + + val needsTranscode = formatInfo.needsTranscodingForEffects() || audioChannels > 2 Log.d( RENDER_TAG, "Video transcoding check: path=$videoPath, " + "isHevc=${formatInfo.isHevc}, bitDepth=${formatInfo.bitDepth}, " + - "isHdr=${formatInfo.isHdr}, needsTranscoding=$needsTranscode" + "isHdr=${formatInfo.isHdr}, channels=$audioChannels, needsTranscoding=$needsTranscode" ) return needsTranscode @@ -82,7 +85,7 @@ object VideoTranscoder { return TranscodeResult.NotNeeded(inputPath) } - Log.i(RENDER_TAG, "Starting HEVC 10-bit HDR -> H.264 8-bit SDR transcoding for: $inputPath") + Log.i(RENDER_TAG, "Starting HEVC 10-bit HDR or Multi-channel -> H.264 8-bit Stereo transcoding for: $inputPath") val outputFile = File( context.cacheDir, @@ -138,10 +141,12 @@ object VideoTranscoder { .build() // Use HDR_MODE_TONE_MAP_HDR_TO_SDR_USING_OPEN_GL to convert HDR to SDR - // This forces 8-bit output which then allows H.264 encoding + // This forces 8-bit output which then allows H.264 encoding. + // Also add standard audio normalization to ensure stereo output. 
val editedMediaItem = EditedMediaItem.Builder(mediaItem)
    .setRemoveAudio(false)
    .setRemoveVideo(false)
+   .setEffects(Effects(listOf(AudioMixingUtils.createStandardStereoMixer()), emptyList()))
    .build()

// Build composition with HDR tonemapping enabled
diff --git a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/VolumeControlAudioMixer.kt b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/VolumeControlAudioMixer.kt
index 11af2ee..04d9bb6 100644
--- a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/VolumeControlAudioMixer.kt
+++ b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/helpers/VolumeControlAudioMixer.kt
@@ -20,24 +20,27 @@ import java.nio.ByteBuffer
 * to each audio source during the mixing process.
 *
 * @property trackVolumes Volume multipliers for each audio track sequence (0.0-1.0+)
- * @property videoAudioPresent Whether video audio is present (not removed due to volume=0)
+ * @property videoAudioSourceCount Number of video sequences that have active audio
+ * @property videoSequenceVolumes Volume multipliers for each video sequence
 */
@UnstableApi
class VolumeControlAudioMixerFactory(
    private val trackVolumes: List<Float>,
-   private val videoAudioPresent: Boolean
+   private val videoAudioSourceCount: Int,
+   private val videoSequenceVolumes: List<Float>
) : AudioMixer.Factory {

    init {
        Log.d(
            RENDER_TAG,
-           "VolumeControlAudioMixerFactory created: trackVolumes=$trackVolumes, videoAudioPresent=$videoAudioPresent"
+           "VolumeControlAudioMixerFactory created: trackVolumes=$trackVolumes, " +
+               "videoAudioSourceCount=$videoAudioSourceCount, videoSequenceVolumes=$videoSequenceVolumes"
        )
    }

    override fun create(): AudioMixer {
        Log.d(RENDER_TAG, "Creating VolumeControlAudioMixer")
-       return VolumeControlAudioMixer(trackVolumes, videoAudioPresent)
+       return VolumeControlAudioMixer(trackVolumes, videoAudioSourceCount, videoSequenceVolumes)
    }
}

@@ -47,17 +50,15 @@ class VolumeControlAudioMixerFactory(
 * When sources are added, it tracks their IDs and applies the appropriate volume
 * using DefaultAudioMixer.setSourceVolume() after each source is added.
 *
- * If videoAudioPresent is true (mixing video + audio tracks):
- *   Source 0 = Video audio (first sequence) - volume 1.0 (per-clip volume not available in mixer mode)
- *   Source 1..N = Audio tracks - applies trackVolumes[0], trackVolumes[1], etc.
- *
- * If videoAudioPresent is false (no video audio):
- *   Source 0..N = Audio tracks - applies trackVolumes[0], trackVolumes[1], etc.
+ * Source order in Media3 Composition (Mixed):
+ *   Source 0..N-1 = Video audio (from each sequence that has an AUDIO track)
+ *   Source N..M = Audio tracks
 */
@UnstableApi
private class VolumeControlAudioMixer(
    private val trackVolumes: List<Float>,
-   private val videoAudioPresent: Boolean
+   private val videoAudioSourceCount: Int,
+   private val videoSequenceVolumes: List<Float>
) : AudioMixer {

    private val delegate: DefaultAudioMixer =
@@ -90,26 +91,19 @@ private class VolumeControlAudioMixer(
    override fun addSource(sourceFormat: AudioProcessor.AudioFormat, startTimeUs: Long): Int {
        val sourceId = delegate.addSource(sourceFormat, startTimeUs)

-       // Determine which volume to apply based on source order and whether video audio is present
+       // Determine which volume to apply based on source order
        val volume: Float
        val sourceType: String

-       if (videoAudioPresent) {
-           // Both video and audio tracks present (mixing mode)
-           // Source 0 = Video audio, Source 1..N = Audio tracks
-           if (sourceCount == 0) {
-               volume = 1.0f // Video audio at full volume (per-clip volume not available in mixer mode)
-               sourceType = "VIDEO AUDIO"
-           } else {
-               val trackIndex = sourceCount - 1
-               volume = trackVolumes.getOrElse(trackIndex) { 1.0f }
-               sourceType = "AUDIO TRACK $trackIndex"
-           }
+       if (sourceCount < videoAudioSourceCount) {
+           // Source is a video audio track
+           volume = videoSequenceVolumes.getOrElse(sourceCount) { 1.0f }
+           sourceType = "VIDEO AUDIO (Sequence $sourceCount)"
        } else {
-           // Video audio was removed - only audio tracks present
-           val trackIndex = sourceCount
+           // Source is an audio track
+           val trackIndex = sourceCount - videoAudioSourceCount
            volume = trackVolumes.getOrElse(trackIndex) { 1.0f }
-           sourceType = "AUDIO TRACK $trackIndex (no video audio)"
+           sourceType = "AUDIO TRACK $trackIndex"
        }

        Log.d(
diff --git a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/models/RenderConfig.kt b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/models/RenderConfig.kt
index bb71bc3..a6f443c 100644
--- a/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/models/RenderConfig.kt
+++ b/android/src/main/kotlin/ch/waio/pro_video_editor/src/features/render/models/RenderConfig.kt
@@ -5,18 +5,32 @@ import ch.waio.pro_video_editor.src.shared.logging.PluginLog as Log
import io.flutter.plugin.common.MethodCall

/**
- * Represents a video clip segment with optional trimming.
+ * Represents a video clip segment with optional trimming and composition parameters.
 *
 * @property inputPath Absolute path to video file
 * @property startUs Start time in microseconds (null = from beginning)
 * @property endUs End time in microseconds (null = until end)
 * @property volume Volume multiplier for this clip (null = unchanged, 0.0=mute, 1.0=original)
+ * @property x Horizontal offset in pixels from the left edge
+ * @property y Vertical offset in pixels from the top edge
+ * @property width Target width of the segment in pixels (null = original)
+ * @property height Target height of the segment in pixels (null = original)
+ * @property zIndex Layer order (higher values on top)
+ * @property opacity Transparency (0.0=invisible, 1.0=opaque)
+ * @property segmentTimeUs Absolute start time in the composition (null = sequential)
 */
data class VideoClip(
    val inputPath: String,
    val startUs: Long?,
    val endUs: Long?,
-   val volume: Float? = null
+   val volume: Float? = null,
+   val x: Double? = null,
+   val y: Double? = null,
+   val width: Double? = null,
+   val height: Double? = null,
+   val zIndex: Int?
= null, + val opacity: Float? = null, + val segmentTimeUs: Long? = null ) /** @@ -176,6 +190,8 @@ data class RenderConfig( val cropY: Int? = null, val scaleX: Float? = null, val scaleY: Float? = null, + val renderWidth: Int? = null, + val renderHeight: Int? = null, val bitrate: Int? = null, val enableAudio: Boolean = true, val playbackSpeed: Float? = null, @@ -231,11 +247,20 @@ data class RenderConfig( inputPath = clipMap["inputPath"] as String, startUs = (clipMap["startUs"] as? Number)?.toLong(), endUs = (clipMap["endUs"] as? Number)?.toLong(), - volume = (clipMap["volume"] as? Number)?.toFloat() + volume = (clipMap["volume"] as? Number)?.toFloat(), + x = (clipMap["x"] as? Number)?.toDouble(), + y = (clipMap["y"] as? Number)?.toDouble(), + width = (clipMap["width"] as? Number)?.toDouble(), + height = (clipMap["height"] as? Number)?.toDouble(), + zIndex = (clipMap["zIndex"] as? Number)?.toInt(), + opacity = (clipMap["opacity"] as? Number)?.toFloat(), + segmentTimeUs = (clipMap["segmentTimeUs"] as? Number)?.toLong() ) Log.d( PACKAGE_TAG, - "Clip $index: path=${clip.inputPath}, start=${clip.startUs}, end=${clip.endUs}, volume=${clip.volume}" + "Clip $index: path=${clip.inputPath}, start=${clip.startUs}, end=${clip.endUs}, " + + "volume=${clip.volume}, pos=(${clip.x}, ${clip.y}), size=${clip.width}x${clip.height}, " + + "zIndex=${clip.zIndex}, opacity=${clip.opacity}, time=${clip.segmentTimeUs}" ) clip } @@ -292,6 +317,8 @@ data class RenderConfig( cropY = call.argument("cropY")?.toInt(), scaleX = call.argument("scaleX")?.toFloat(), scaleY = call.argument("scaleY")?.toFloat(), + renderWidth = call.argument("renderWidth")?.toInt(), + renderHeight = call.argument("renderHeight")?.toInt(), bitrate = call.argument("bitrate")?.toInt(), enableAudio = call.argument("enableAudio") ?: true, playbackSpeed = call.argument("playbackSpeed")?.toFloat(), diff --git a/example/lib/features/render/video_renderer_page.dart b/example/lib/features/render/video_renderer_page.dart index 42ca328..d691a6c 100644 --- a/example/lib/features/render/video_renderer_page.dart +++ b/example/lib/features/render/video_renderer_page.dart @@ -47,11 +47,10 @@ class _VideoRendererPageState extends State { final double _blurFactor = 0; final List> _colorFilters = []; - // kBasicFilterMatrix kComplexFilterMatrix - VideoMetadata? _outputMetadata; String _taskId = DateTime.now().microsecondsSinceEpoch.toString(); + String? _error; late final EditorVideo _video; @@ -373,7 +372,7 @@ class _VideoRendererPageState extends State { videoSegments: [ VideoSegment( video: _video, - startTime: const Duration(seconds: 0), + startTime: Duration.zero, endTime: const Duration(seconds: 7), volume: 1.0, ), @@ -636,6 +635,215 @@ class _VideoRendererPageState extends State { await _renderVideo(data); } + /// Picture in picture example. Combined video segments. 
+  ///
+  /// This example demonstrates how to render a composite video from multiple
+  /// overlapping video segments.
+  Future<void> _combinedPip() async {
+    VideoMetadata meta = await _pve.getMetadata(_video);
+    var Size(width: width, height: height) = meta.resolution;
+
+    var data = VideoRenderData(
+      videoSegments: [
+        // The first segment drives the output resolution.
+        VideoSegment(
+          video: _video,
+          segmentTime: Duration.zero,
+          startTime: Duration.zero,
+          endTime: const Duration(seconds: 10),
+          offset: const Offset(100, 0),
+          // size: Size(width, height),
+          zIndex: 0,
+          volume: 1.0,
+        ),
+        VideoSegment(
+          video: _video,
+          segmentTime: const Duration(seconds: 5),
+          startTime: const Duration(seconds: 5),
+          endTime: const Duration(seconds: 15),
+          offset: const Offset(20, 20),
+          size: Size(width / 2, height / 2),
+          zIndex: 10,
+          volume: 0,
+        ),
+      ],
+
+      // imageLayers: [
+      //   // Stretched overlay for entire video
+      //   ImageLayer(image: EditorLayerImage.memory(imageBytes)),
+      //   // Sticker visible only from 3s–8s
+      //   ImageLayer(
+      //     image: stickerImage,
+      //     offset: const Offset(500, 150),
+      //     startTime: const Duration(seconds: 3),
+      //     endTime: const Duration(seconds: 8),
+      //   ),
+      // ],
+      // audioTracks: [
+      //   // Background music in second half at low volume
+      //   VideoAudioTrack(
+      //     path: audioFile.path,
+      //     volume: 0.4,
+      //     startTime: const Duration(seconds: 10),
+      //   ),
+      // ],
+    );
+
+    await _renderVideo(data);
+  }
+
+  /// Video stack example. Combined video segments.
+  ///
+  /// This example demonstrates how to render a composite video with multiple
+  /// stacked video segments.
+  Future<void> _combinedStack() async {
+    VideoMetadata meta1 = await _pve.getMetadata(_video);
+    var Size(width: width1, height: height1) = meta1.resolution;
+
+    EditorVideo video2 = EditorVideo.asset(kVideoEditorExampleAssetWorldPath);
+    VideoMetadata meta2 = await _pve.getMetadata(video2);
+    var Size(width: width2, height: height2) = meta2.resolution;
+
+    double width = max(width1, width2);
+    double scale1 = width / width1;
+    double scale2 = width / width2;
+
+    // Resize one video to match the other's width, then stack them
+    // vertically (video1 on top of video2).
+    double height = height1 * scale1 + height2 * scale2;
+
+    // int duration = min(meta1.duration.inSeconds, meta2.duration.inSeconds);
+    debugPrint('target resolution: $width x $height');
+
+    var data = VideoRenderData(
+      qualityConfig: VideoQualityConfig.custom(
+        bitrate: meta1.bitrate,
+        resolution: Size(width, height),
+      ),
+      videoSegments: [
+        VideoSegment(
+          video: _video,
+          segmentTime: Duration.zero,
+          startTime: Duration.zero,
+          // endTime: Duration(seconds: duration),
+          endTime: const Duration(seconds: 5),
+          offset: const Offset(0, 0),
+          size: Size(width1 * scale1, height1 * scale1),
+          zIndex: 10,
+          volume: 1.0,
+        ),
+
+        VideoSegment(
+          video: video2,
+          segmentTime: Duration.zero,
+          startTime: Duration.zero,
+          // endTime: Duration(seconds: duration),
+          endTime: const Duration(seconds: 5),
+          offset: Offset(0, height1 * scale1),
+          size: Size(width2 * scale2, height2 * scale2),
+          volume: 0,
+        ),
+      ],
+    );
+
+    await _renderVideo(data);
+  }
+
+  /// Video grid example. Combined video segments.
+ /// + /// This example demonstrates how to render composite video from multiple + /// overlapping video segments + Future _combinedGrid() async { + VideoMetadata metadata = await _pve.getMetadata(_video); + + Size resolution = metadata.resolution; + double width = resolution.width; + double height = resolution.height; + + // red image with opacity bytes for interleaving test + final redRecorder = ui.PictureRecorder(); + ui.Canvas(redRecorder).drawRect( + const ui.Rect.fromLTWH(0, 0, 500, 500), + ui.Paint()..color = Colors.red.withValues(alpha: 0.5), + ); + ui.Picture redPicture = redRecorder.endRecording(); + ui.Image redImg = await redPicture.toImage(500, 500); + ByteData? bytes = await redImg.toByteData(format: ui.ImageByteFormat.png); + Uint8List redImageBytes = bytes!.buffer.asUint8List(); + redPicture.dispose(); + + var data = VideoRenderData( + qualityConfig: VideoQualityConfig.custom( + bitrate: metadata.bitrate, + resolution: Size(width * 2, height * 2), + ), + videoSegments: [ + VideoSegment( + video: _video, + segmentTime: Duration.zero, + startTime: Duration.zero, + endTime: const Duration(seconds: 10), + offset: const Offset(0, 0), + size: Size(width, height), + zIndex: 10, + volume: 1.0, + ), + VideoSegment( + video: _video, + segmentTime: const Duration(seconds: 3), + startTime: const Duration(seconds: 3), + endTime: const Duration(seconds: 13), + offset: Offset(width - 50, 50), + size: Size(width, height), + zIndex: 20, + volume: 1.0, + ), + VideoSegment( + video: _video, + segmentTime: const Duration(seconds: 6), + startTime: const Duration(seconds: 6), + endTime: const Duration(seconds: 16), + offset: Offset(width, height), + size: Size(width, height), + zIndex: 30, + volume: 1.0, + ), + VideoSegment( + video: _video, + segmentTime: const Duration(seconds: 9), + startTime: const Duration(seconds: 9), + endTime: const Duration(seconds: 20), + offset: Offset(50, height - 50), + size: Size(width, height), + zIndex: 40, + volume: 1.0, + ), + + // 480 × 270 + VideoSegment( + video: EditorVideo.asset(kVideoEditorExampleAssetWorldPath), + segmentTime: Duration.zero, + zIndex: 50, + opacity: 0.5, + offset: Offset(width - 600, height - 200), + size: const Size(480 * 2, 270 * 2), + volume: 0, + ), + ], + + imageLayers: [ + // Transparent red square at the top + ImageLayer( + image: EditorLayerImage.memory(redImageBytes), + offset: Offset(width - 250, height - 300), + size: const Size(500, 500), + ), + ], + ); + + await _renderVideo(data); + } + /// Fade animation on image layer. 
/// /// This example demonstrates a simple fade-in and fade-out animation @@ -805,7 +1013,7 @@ class _VideoRendererPageState extends State { videoSegments: [ VideoSegment( video: _video, - startTime: const Duration(seconds: 0), + startTime: Duration.zero, endTime: const Duration(seconds: 5), ), VideoSegment( @@ -831,7 +1039,7 @@ class _VideoRendererPageState extends State { videoSegments: [ VideoSegment( video: _video, - startTime: const Duration(seconds: 0), + startTime: Duration.zero, endTime: const Duration(seconds: 5), ), VideoSegment( @@ -934,6 +1142,7 @@ class _VideoRendererPageState extends State { Future _renderVideo(VideoRenderData value) async { _taskId = DateTime.now().microsecondsSinceEpoch.toString(); + _error = null; setState(() => _isExporting = true); final directory = await getTemporaryDirectory(); @@ -949,8 +1158,15 @@ class _VideoRendererPageState extends State { } on RenderCanceledException { setState(() => _isExporting = false); return; + } catch (ex) { + setState(() { + _error = 'Failed to render video: $ex'; + _isExporting = false; + }); + return; } + debugPrint('output $outputPath'); final result = File(outputPath).readAsBytesSync(); _generationTime = sp.elapsed; @@ -1112,10 +1328,13 @@ class _VideoRendererPageState extends State { ), Text( 'Result: ${formatBytes(_videoBytes!.lengthInBytes)} ' - 'bytes in ${_generationTime.inMilliseconds}ms', + 'bytes in ${_generationTime.inMilliseconds}ms, ' + '${_outputMetadata?.resolution.width ?? 0}' + ' x ${_outputMetadata?.resolution.height ?? 0}', ), if (_outputMetadata?.isOptimizedForStreaming != null) Row( + spacing: 6, children: [ Icon( _outputMetadata!.isOptimizedForStreaming! @@ -1126,7 +1345,6 @@ class _VideoRendererPageState extends State { : Colors.red, size: 18, ), - const SizedBox(width: 6), Text( _outputMetadata!.isOptimizedForStreaming! ? 
'Optimized for streaming (moov before mdat)' @@ -1134,6 +1352,19 @@ class _VideoRendererPageState extends State { ), ], ), + if (_error != null) + Row( + crossAxisAlignment: CrossAxisAlignment.start, + spacing: 6, + children: [ + const Icon( + Icons.warning_rounded, + color: Colors.red, + size: 20, + ), + Expanded(child: Text('$_error')), + ], + ), ], ); } @@ -1183,11 +1414,13 @@ class _VideoRendererPageState extends State { onTap: _layers, leading: const Icon(Icons.layers_outlined), title: const Text('Parse with layers'), + subtitle: const Text('Layer for the whole video duration'), ), ListTile( onTap: _layersTimed, leading: const Icon(Icons.av_timer_outlined), title: const Text('Parse with timed layers'), + subtitle: const Text('Layers at 0, 5, 7 + random one every second'), ), ListTile( onTap: _layersWithSize, @@ -1215,6 +1448,7 @@ class _VideoRendererPageState extends State { onTap: _multipleChanges, leading: const Icon(Icons.web_stories_outlined), title: const Text('Multiple changes'), + subtitle: const Text('FlipX, image, color filter, crop'), ), ListTile( onTap: _combinedTimeBased, @@ -1222,6 +1456,24 @@ class _VideoRendererPageState extends State { title: const Text('Combined Time-Based'), subtitle: const Text('Clips + filters + layers + audio, all timed'), ), + ListTile( + onTap: _combinedPip, + leading: const Icon(Icons.picture_in_picture_alt), + title: const Text('Picture in picture'), + subtitle: const Text('Pip starts at 5, main ends at 10'), + ), + ListTile( + onTap: _combinedStack, + leading: const Icon(Icons.stacked_line_chart), + title: const Text('Video stack'), + subtitle: const Text('Two videos stacked together'), + ), + ListTile( + onTap: _combinedGrid, + leading: const Icon(Icons.dashboard_outlined), + title: const Text('Video grid'), + subtitle: const Text('Grid of videos, with overlays and opacity'), + ), ListTile( onTap: _bitrate, leading: const Icon(Icons.animation), diff --git a/ios/Classes/src/features/render/RenderVideo.swift b/ios/Classes/src/features/render/RenderVideo.swift index 5a78609..4eddb3f 100644 --- a/ios/Classes/src/features/render/RenderVideo.swift +++ b/ios/Classes/src/features/render/RenderVideo.swift @@ -74,7 +74,14 @@ class RenderVideo { inputPath: newPath, startUs: clip.startUs, endUs: clip.endUs, - volume: clip.volume + volume: clip.volume, + opacity: clip.opacity, + x: clip.x, + y: clip.y, + width: clip.width, + height: clip.height, + segmentTimeUs: clip.segmentTimeUs, + zIndex: clip.zIndex ) } return clip @@ -138,13 +145,16 @@ class RenderVideo { var effectsConfig = VideoCompositorConfig() // Use composition helper to merge multiple video clips - let (composition, videoCompData, renderSize, audioMix, sourceTrackID) = + let (composition, videoCompData, renderSize, audioMix, sourceTrackID, updatedEffectsConfig) = try await applyComposition( videoClips: workingConfig.videoClips, videoEffects: effectsConfig, enableAudio: workingConfig.enableAudio, - audioTracks: workingConfig.audioTracks + audioTracks: workingConfig.audioTracks, + renderWidth: workingConfig.renderWidth, + renderHeight: workingConfig.renderHeight ) + effectsConfig = updatedEffectsConfig var videoCompConfig = videoCompData // Set source track ID for fallback on older iOS versions (e.g., iPhone 7) @@ -235,9 +245,17 @@ class RenderVideo { videoComposition.frameDuration = videoCompConfig.frameDuration videoComposition.renderSize = finalRenderSize videoComposition.instructions = videoCompConfig.instructions + + // Ensure compositor knows the intended logical size for coordinate 
mapping + effectsConfig.intendedRenderSize = finalRenderSize + videoComposition.customVideoCompositorClass = makeVideoCompositorSubclass(with: effectsConfig) - let preset = applyBitrate(requestedBitrate: workingConfig.bitrate) + let preset = applyBitrate( + requestedBitrate: workingConfig.bitrate, + renderWidth: workingConfig.renderWidth, + renderHeight: workingConfig.renderHeight + ) let export = try await prepareExportSession( composition: composition, diff --git a/ios/Classes/src/features/render/helpers/ApplyBitrate.swift b/ios/Classes/src/features/render/helpers/ApplyBitrate.swift index 9b84b4a..d441852 100644 --- a/ios/Classes/src/features/render/helpers/ApplyBitrate.swift +++ b/ios/Classes/src/features/render/helpers/ApplyBitrate.swift @@ -7,8 +7,9 @@ import AVFoundation /// resolution/quality presets. /// /// - Parameters: -/// - requestedBitrate: Target bitrate in bits per second. If nil, returns preset hint or highest quality. -/// - presetHint: Optional preset to use as fallback. If nil, defaults to highest quality. +/// - requestedBitrate: Target bitrate in bits per second. If nil, returns highest quality. +/// - renderWidth: Optional target render width to ensure preset supports the resolution. +/// - renderHeight: Optional target render height to ensure preset supports the resolution. /// - Returns: AVAssetExportPreset string matching the requested quality level. /// /// Bitrate mapping: @@ -23,7 +24,28 @@ import AVFoundation /// - ≥2 Mbps: 480p /// - ≥1 Mbps: Medium quality /// - <1 Mbps: Low quality -public func applyBitrate(requestedBitrate: Int?, presetHint: String? = nil) -> String { +public func applyBitrate( + requestedBitrate: Int?, + renderWidth: Double? = nil, + renderHeight: Double? = nil +) -> String { + // If a custom resolution is provided, we should ideally use a "HighestQuality" + // preset to avoid resolution constraints from bitrate-based presets. + // However, if a bitrate is also specified, we'll try to pick the best matching one. + if let rw = renderWidth, let rh = renderHeight { + let maxDim = max(rw, rh) + + if maxDim > 1920 { + if #available(iOS 11.0, *) { + return AVAssetExportPresetHEVC3840x2160 + } else { + return AVAssetExportPreset3840x2160 + } + } else if maxDim > 1280 { + return AVAssetExportPreset1920x1080 + } + } + if let bitrate = requestedBitrate { PluginLog.print( "[\(Tags.render)] 📊 Requested bitrate: \(bitrate) bps (\(String(format: "%.1f", Double(bitrate) / 1_000_000)) Mbps)" @@ -73,5 +95,5 @@ public func applyBitrate(requestedBitrate: Int?, presetHint: String? = nil) -> S } } - return presetHint ?? AVAssetExportPresetHighestQuality + return AVAssetExportPresetHighestQuality } diff --git a/ios/Classes/src/features/render/helpers/ApplyComposition.swift b/ios/Classes/src/features/render/helpers/ApplyComposition.swift index 98d3080..115d3c8 100644 --- a/ios/Classes/src/features/render/helpers/ApplyComposition.swift +++ b/ios/Classes/src/features/render/helpers/ApplyComposition.swift @@ -19,18 +19,22 @@ import Foundation /// - CGSize: Final render size (max dimensions from all clips) /// - AVAudioMix?: Audio mix with volume controls (nil if no audio mixing needed) /// - CMPersistentTrackID: The track ID of the video composition track (for fallback on older iOS) +/// - VideoCompositorConfig: Updated compositor configuration with track info /// /// - Throws: NSError if video clips are empty, files don't exist, or tracks can't be loaded. 
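/*
 Usage sketch for the widened helper below — illustrative only: `clips`,
 `config`, and `tracks` are assumed caller-side values, and the render size is
 an arbitrary example.

 let (composition, videoData, renderSize, audioMix, fallbackTrackID, compositorConfig) =
     try await applyComposition(
         videoClips: clips,
         videoEffects: config,
         enableAudio: true,
         audioTracks: tracks,
         renderWidth: 1920,
         renderHeight: 1080
     )
 // compositorConfig.videoClipConfigs maps composition track IDs to their
 // VideoClip, which the custom compositor uses to place each frame.
*/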
func applyComposition( videoClips: [VideoClip], videoEffects: VideoCompositorConfig, enableAudio: Bool, - audioTracks: [AudioTrackConfig] + audioTracks: [AudioTrackConfig], + renderWidth: Double? = nil, + renderHeight: Double? = nil ) async throws -> ( - AVMutableComposition, VideoCompositionData, CGSize, AVAudioMix?, CMPersistentTrackID + AVMutableComposition, VideoCompositionData, CGSize, AVAudioMix?, CMPersistentTrackID, VideoCompositorConfig ) { return try await CompositionBuilder(videoClips: videoClips, videoEffects: videoEffects) .setEnableAudio(enableAudio) .setAudioTracks(audioTracks) + .setRenderSize(width: renderWidth, height: renderHeight) .build() } diff --git a/ios/Classes/src/features/render/helpers/ApplyPlaybackSpeed.swift b/ios/Classes/src/features/render/helpers/ApplyPlaybackSpeed.swift index 4099370..6d54575 100644 --- a/ios/Classes/src/features/render/helpers/ApplyPlaybackSpeed.swift +++ b/ios/Classes/src/features/render/helpers/ApplyPlaybackSpeed.swift @@ -43,10 +43,10 @@ public func applyPlaybackSpeed( } let scaledStart = CMTimeMultiplyByFloat64(custom.timeRange.start, multiplier: multiplier) let scaledDuration = CMTimeMultiplyByFloat64(custom.timeRange.duration, multiplier: multiplier) - let trackID = (custom.requiredSourceTrackIDs?.first as? NSNumber)?.int32Value ?? kCMPersistentTrackID_Invalid + let trackIDs = custom.requiredSourceTrackIDs?.compactMap { ($0 as? NSNumber)?.int32Value } ?? [] return CustomVideoCompositionInstruction( timeRange: CMTimeRange(start: scaledStart, duration: scaledDuration), - sourceTrackID: trackID, + sourceTrackIDs: trackIDs, layerInstructions: custom.layerInstructions, backgroundColor: custom.backgroundColor ) diff --git a/ios/Classes/src/features/render/helpers/CompositionBuilder.swift b/ios/Classes/src/features/render/helpers/CompositionBuilder.swift index 0590ec2..e2ef6d8 100644 --- a/ios/Classes/src/features/render/helpers/CompositionBuilder.swift +++ b/ios/Classes/src/features/render/helpers/CompositionBuilder.swift @@ -12,6 +12,8 @@ internal class CompositionBuilder { private let videoEffects: VideoCompositorConfig private var enableAudio: Bool = true private var audioTracks: [AudioTrackConfig] = [] + private var renderWidth: Double? + private var renderHeight: Double? /// Initializes builder with configuration. /// @@ -23,6 +25,13 @@ internal class CompositionBuilder { self.videoEffects = videoEffects } + /// Sets the target render size. + func setRenderSize(width: Double?, height: Double?) -> CompositionBuilder { + self.renderWidth = width + self.renderHeight = height + return self + } + /// Enables or disables audio. 
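/*
 Shape of the reworked multi-track instruction — a minimal sketch assuming two
 overlapping clips whose composition tracks are `trackA` and `trackB`
 (hypothetical names, as are the layer instructions):

 let instruction = CustomVideoCompositionInstruction(
     timeRange: CMTimeRange(
         start: .zero,
         duration: CMTime(seconds: 5, preferredTimescale: 600)
     ),
     sourceTrackIDs: [trackA.trackID, trackB.trackID],
     layerInstructions: [layerInstructionA, layerInstructionB],
     backgroundColor: CGColor(red: 0, green: 0, blue: 0, alpha: 1)
 )
 // requiredSourceTrackIDs now reports both IDs, so the compositor receives a
 // source frame for every clip that is active in this segment.
*/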
/// /// - Parameter enabled: If true, includes original audio from video clips @@ -46,7 +55,7 @@ internal class CompositionBuilder { /// - Returns: Tuple containing composition, video composition, render size, audio mix, and source track ID /// - Throws: Error if composition creation fails func build() async throws -> ( - AVMutableComposition, VideoCompositionData, CGSize, AVAudioMix?, CMPersistentTrackID + AVMutableComposition, VideoCompositionData, CGSize, AVAudioMix?, CMPersistentTrackID, VideoCompositorConfig ) { guard !videoClips.isEmpty else { throw NSError( @@ -64,9 +73,14 @@ internal class CompositionBuilder { // Build video sequence let videoBuilder = VideoSequenceBuilder(videoClips: videoClips) .setEnableAudio(enableAudio) + .setRenderSize(width: renderWidth, height: renderHeight) let videoResult = try await videoBuilder.build(in: composition) + // Store track configs for compositor + var updatedVideoEffects = videoEffects + updatedVideoEffects.videoClipConfigs = videoResult.trackConfigs + // Add custom audio tracks var customAudioTracks: [(track: AVMutableCompositionTrack, config: AudioTrackConfig)] = [] for trackConfig in audioTracks { @@ -106,9 +120,7 @@ internal class CompositionBuilder { ) let compositionRenderSize = videoResult.renderSize - // Create instructions for each clip segment - // Use custom instruction class to ensure requiredSourceTrackIDs is properly set - // This fixes issues on older iOS versions (e.g., iPhone 7, iOS 15) + // Create instructions for each non-overlapping time segment var instructions: [AVVideoCompositionInstructionProtocol] = [] PluginLog.print("") @@ -120,6 +132,8 @@ internal class CompositionBuilder { PluginLog.print("==========================================") PluginLog.print("") + // Calculate pre-determined transforms for all clips + var clipTransforms: [CGAffineTransform] = [] for (index, clipInstruction) in videoResult.clipInstructions.enumerated() { PluginLog.print("🎬 Processing instruction for clip \(index)") PluginLog.print( @@ -133,36 +147,55 @@ internal class CompositionBuilder { with: clipInstruction.transform, clipIndex: index ) + clipTransforms.append(transform) + } - let layerInstruction: AVVideoCompositionLayerInstruction - if #available(iOS 26.0, *) { - var config = AVVideoCompositionLayerInstruction.Configuration( - assetTrack: videoResult.videoTrack - ) - config.setTransform(transform, at: .zero) - layerInstruction = AVVideoCompositionLayerInstruction(configuration: config) - } else { - let mutableInstruction = AVMutableVideoCompositionLayerInstruction( - assetTrack: videoResult.videoTrack - ) - mutableInstruction.setTransform(transform, at: .zero) - layerInstruction = mutableInstruction - } - - // Use custom instruction that explicitly provides requiredSourceTrackIDs - let instruction = CustomVideoCompositionInstruction( - timeRange: clipInstruction.timeRange, - sourceTrackID: videoResult.videoTrack.trackID, - layerInstructions: [layerInstruction], - backgroundColor: CGColor(red: 0, green: 0, blue: 0, alpha: 1) - ) + // Calculate non-overlapping time segments + let segments = calculateSegments( + from: videoResult.clipInstructions, + totalDuration: videoResult.totalDuration + ) + for (segIndex, segmentRange) in segments.enumerated() { + PluginLog.print("🎬 Processing segment \(segIndex)") PluginLog.print( - " ⚙️ Layer instruction configured with transform (trackID: \(videoResult.videoTrack.trackID))" + " Time range: \(String(format: "%.2f", segmentRange.start.seconds))s - \(String(format: "%.2f", (segmentRange.start + 
segmentRange.duration).seconds))s" ) - PluginLog.print("") - instructions.append(instruction) + var activeTrackIDs: [CMPersistentTrackID] = [] + var layerInstructions: [AVVideoCompositionLayerInstruction] = [] + + for (clipIndex, clipInstruction) in videoResult.clipInstructions.enumerated() { + // Check if this clip is active during this segment + let clipRange = clipInstruction.timeRange + let intersection = CMTimeRangeGetIntersection(segmentRange, otherRange: clipRange) + + if CMTimeGetSeconds(intersection.duration) > 0 { + activeTrackIDs.append(clipInstruction.trackID) + + let transform = clipTransforms[clipIndex] + let mutableLayerInstruction = AVMutableVideoCompositionLayerInstruction( + assetTrack: composition.track(withTrackID: clipInstruction.trackID)! + ) + mutableLayerInstruction.setTransform(transform, at: .zero) + layerInstructions.append(mutableLayerInstruction) + + PluginLog.print(" - Added trackID \(clipInstruction.trackID) (Clip \(clipIndex))") + } + } + + if !layerInstructions.isEmpty { + // Use custom instruction that explicitly provides requiredSourceTrackIDs + let instruction = CustomVideoCompositionInstruction( + timeRange: segmentRange, + sourceTrackIDs: activeTrackIDs, + layerInstructions: layerInstructions, + backgroundColor: CGColor(red: 0, green: 0, blue: 0, alpha: 1) + ) + instructions.append(instruction) + PluginLog.print(" ✅ Segment instruction created with \(layerInstructions.count) layers") + } + PluginLog.print("") } let videoCompositionData = VideoCompositionData( @@ -173,10 +206,10 @@ internal class CompositionBuilder { PluginLog.print("✅ Composition created successfully with \(videoClips.count) clips") - // Return the track ID for fallback on older iOS versions - let sourceTrackID = videoResult.videoTrack.trackID + // Return the first track ID for fallback on older iOS versions + let sourceTrackID = videoResult.clipInstructions.first?.trackID ?? kCMPersistentTrackID_Invalid - return (composition, videoCompositionData, videoResult.renderSize, audioMix, sourceTrackID) + return (composition, videoCompositionData, videoResult.renderSize, audioMix, sourceTrackID, updatedVideoEffects) } /// Creates audio mix with per-clip and per-track volume parameters. @@ -191,11 +224,17 @@ internal class CompositionBuilder { for track in originalTracks { let inputParameters = AVMutableAudioMixInputParameters(track: track) - // Use setVolumeRamp for each clip's time range to ensure - // volume changes are applied precisely per segment - for (index, clipInstruction) in clipInstructions.enumerated() { + // Find all instructions that apply to this specific audio track + let relevantInstructions = clipInstructions.enumerated().filter { _, instruction in + instruction.audioTrackID == track.trackID + } + + for (index, clipInstruction) in relevantInstructions { let clipVolume = index < videoClips.count ? (videoClips[index].volume ?? 
1.0) : 1.0
+
+               PluginLog.print("🔊 Setting volume ramp for track \(track.trackID): volume=\(clipVolume) at \(String(format: "%.2f", clipInstruction.timeRange.start.seconds))s")
+
                inputParameters.setVolumeRamp(
                    fromStartVolume: clipVolume,
                    toEndVolume: clipVolume,
@@ -204,7 +243,7 @@
            }
            audioMixInputParameters.append(inputParameters)
-           PluginLog.print("🔊 Applied per-clip volume to original audio track")
+           PluginLog.print("🔊 Applied per-clip volume to original audio track (ID: \(track.trackID))")
        }

        // Apply volume to custom audio tracks
@@ -234,6 +273,16 @@
        with preferredTransform: CGAffineTransform,
        clipIndex: Int
    ) -> CGAffineTransform {
+       // If manual positioning is requested, use the preferred transform as-is.
+       // The compositor will handle custom scaling and positioning based on the VideoClip config.
+       if clipIndex < videoClips.count {
+           let clip = videoClips[clipIndex]
+           if clip.x != nil || clip.y != nil || clip.width != nil || clip.height != nil {
+               PluginLog.print("  🎯 Manual positioning detected for clip \(clipIndex), skipping fit and center transform")
+               return preferredTransform
+           }
+       }
+
        // Get the display size after applying the original transform (handles rotation)
        let displaySize = naturalSize.applying(preferredTransform)
        let videoWidth = abs(displaySize.width)
@@ -315,4 +364,39 @@
        return transform
    }
+
+   /// Calculates non-overlapping time segments from clip instructions.
+   private func calculateSegments(from instructions: [ClipInstruction], totalDuration: CMTime) -> [CMTimeRange] {
+       var points: [CMTime] = [.zero, totalDuration]
+       for instruction in instructions {
+           points.append(instruction.timeRange.start)
+           points.append(CMTimeAdd(instruction.timeRange.start, instruction.timeRange.duration))
+       }
+
+       let sortedPoints = points
+           .filter { CMTimeCompare($0, totalDuration) <= 0 }
+           .sorted { CMTimeCompare($0, $1) < 0 }
+
+       var uniquePoints: [CMTime] = []
+       for point in sortedPoints {
+           if let last = uniquePoints.last {
+               if CMTimeCompare(last, point) != 0 {
+                   uniquePoints.append(point)
+               }
+           } else {
+               uniquePoints.append(point)
+           }
+       }
+
+       var segments: [CMTimeRange] = []
+       for i in 0..<(uniquePoints.count - 1) {
+           let start = uniquePoints[i]
+           let duration = CMTimeSubtract(uniquePoints[i + 1], start)
+           if CMTimeGetSeconds(duration) > 0 {
+               segments.append(CMTimeRange(start: start, duration: duration))
+           }
+       }
+       return segments
+   }
}
diff --git a/ios/Classes/src/features/render/helpers/VideoSequenceBuilder.swift b/ios/Classes/src/features/render/helpers/VideoSequenceBuilder.swift
index 56f93b3..32c010a 100644
--- a/ios/Classes/src/features/render/helpers/VideoSequenceBuilder.swift
+++ b/ios/Classes/src/features/render/helpers/VideoSequenceBuilder.swift
@@ -9,6 +9,8 @@ internal class VideoSequenceBuilder {
    private let videoClips: [VideoClip]
    private var enableAudio: Bool = true
+   private var renderWidth: Double?
+   private var renderHeight: Double?

    /// Initializes builder with video clips.
    ///
@@ -17,6 +19,13 @@
        self.videoClips = videoClips
    }

+   /// Sets the target render size.
+   func setRenderSize(width: Double?, height: Double?) -> VideoSequenceBuilder {
+       self.renderWidth = width
+       self.renderHeight = height
+       return self
+   }
+
    /// Enables or disables audio in the output.
    ///
    /// - Parameter enabled: If true, includes original audio from video clips
@@ -67,7 +76,7 @@
    /// Builds the video composition with all clips.
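/*
 Worked example for calculateSegments, with illustrative values: clip 1 covers
 0–10 s and clip 2 covers 5–15 s, so the boundary points are {0, 5, 10, 15}
 and three non-overlapping segments come out — 0–5 s (clip 1 only), 5–10 s
 (both clips), 10–15 s (clip 2 only). Each segment later gets exactly one
 instruction listing the track IDs active within it.

 let clip1 = CMTimeRange(start: .zero,
                         duration: CMTime(seconds: 10, preferredTimescale: 600))
 let clip2 = CMTimeRange(start: CMTime(seconds: 5, preferredTimescale: 600),
                         duration: CMTime(seconds: 10, preferredTimescale: 600))
 // points: 0, 5, 10, 15  →  segments: [0,5), [5,10), [10,15)
*/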
/// /// - Parameter composition: Composition to build into - /// - Returns: Tuple containing video track, audio tracks, render size, frame rate, and clip instructions + /// - Returns: Tuple containing video tracks, audio tracks, render size, frame rate, and clip instructions func build(in composition: AVMutableComposition) async throws -> VideoSequenceResult { guard !videoClips.isEmpty else { throw NSError( @@ -85,32 +94,7 @@ internal class VideoSequenceBuilder { var maxFrameRate: Float = 30.0 var originalAudioTracks: [AVMutableCompositionTrack] = [] var clipInstructions: [ClipInstruction] = [] - - // Create single video track for all clips - guard - let compositionVideoTrack = composition.addMutableTrack( - withMediaType: .video, - preferredTrackID: kCMPersistentTrackID_Invalid - ) - else { - throw NSError( - domain: "VideoSequenceBuilder", - code: 2, - userInfo: [NSLocalizedDescriptionKey: "Failed to create video track"] - ) - } - - // Create single shared audio track for all clips (if enabled) - var sharedAudioTrack: AVMutableCompositionTrack? - if enableAudio { - sharedAudioTrack = composition.addMutableTrack( - withMediaType: .audio, - preferredTrackID: kCMPersistentTrackID_Invalid - ) - if sharedAudioTrack != nil { - PluginLog.print("🔊 Created SHARED audio track for all clips (will prevent empty segments)") - } - } + var trackConfigs: [CMPersistentTrackID: VideoClip] = [:] // Process each video clip for (index, clip) in videoClips.enumerated() { @@ -156,15 +140,12 @@ internal class VideoSequenceBuilder { PluginLog.print(" - Display size: \(correctedSize.width) x \(correctedSize.height)") PluginLog.print(" - Frame rate: \(nominalFrameRate) fps") - // Update max render size - if correctedSize.width > maxRenderSize.width - || correctedSize.height > maxRenderSize.height - { - let oldSize = maxRenderSize + // Update max render size (only if not explicitly provided) + if index == 0 && (renderWidth == nil || renderHeight == nil) { maxRenderSize = correctedSize - PluginLog.print( - " - ⬆️ Max render size updated: \(oldSize.width)x\(oldSize.height) → \(maxRenderSize.width)x\(maxRenderSize.height)" - ) + PluginLog.print(" - 📏 Base render size set from first clip: \(maxRenderSize.width)x\(maxRenderSize.height)") + } else if renderWidth != nil && renderHeight != nil { + maxRenderSize = CGSize(width: renderWidth!, height: renderHeight!) } // Update max frame rate @@ -176,62 +157,87 @@ internal class VideoSequenceBuilder { let clipTimeRange = await calculateTimeRange(for: clip, from: asset) let clipDuration = clipTimeRange.duration + // Determine insertion time in composition + let insertionTime: CMTime + if let segmentTimeUs = clip.segmentTimeUs { + insertionTime = CMTime(value: segmentTimeUs, timescale: 1_000_000) + } else { + insertionTime = totalDuration + } + + // Create a new track for each clip to support overlapping and independent positioning + guard let compositionVideoTrack = composition.addMutableTrack( + withMediaType: .video, + preferredTrackID: kCMPersistentTrackID_Invalid + ) else { + throw NSError( + domain: "VideoSequenceBuilder", + code: 2, + userInfo: [NSLocalizedDescriptionKey: "Failed to create video track for clip \(index)"] + ) + } + // Insert video clip into the composition track try compositionVideoTrack.insertTimeRange( clipTimeRange, of: videoTrack, - at: totalDuration + at: insertionTime ) + // Track mapping for compositor + trackConfigs[compositionVideoTrack.trackID] = clip + + // Add audio if enabled + var audioTrackID: CMPersistentTrackID? 
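/*
 Timeline sketch for segmentTimeUs, with illustrative numbers: clip A is
 sequential (segmentTimeUs = nil) and 10 s long, clip B sets
 segmentTimeUs = 5_000_000, so B is inserted at 5 s instead of being appended
 at 10 s, and the two clips overlap for 5 s:

 let insertionTime = CMTime(value: 5_000_000, timescale: 1_000_000)  // 5 s
 // endInComposition = 5 s + 10 s = 15 s, so totalDuration grows to 15 s
 // even though the sequential part of the timeline ends at 10 s.
*/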
= nil + if enableAudio, + let audioTrack = try? await MediaInfoExtractor.loadAudioTrack(from: asset) + { + if let compositionAudioTrack = composition.addMutableTrack( + withMediaType: .audio, + preferredTrackID: kCMPersistentTrackID_Invalid + ) { + do { + try compositionAudioTrack.insertTimeRange( + clipTimeRange, + of: audioTrack, + at: insertionTime + ) + originalAudioTracks.append(compositionAudioTrack) + audioTrackID = compositionAudioTrack.trackID + PluginLog.print(" 🔊 Audio inserted into its own track (ID: \(audioTrackID!))") + } catch { + PluginLog.print(" ❌ ERROR inserting audio: \(error.localizedDescription)") + } + } else { + PluginLog.print(" ⚠️ WARNING: Failed to create audio track for clip \(index)") + } + } + // Store instruction for this clip segment clipInstructions.append( ClipInstruction( - timeRange: CMTimeRange(start: totalDuration, duration: clipDuration), + timeRange: CMTimeRange(start: insertionTime, duration: clipDuration), transform: preferredTransform, naturalSize: naturalSize, - renderSize: correctedSize + renderSize: correctedSize, + trackID: compositionVideoTrack.trackID, + audioTrackID: audioTrackID )) - // Add audio to shared track if enabled - if enableAudio, - let audioTrack = try? await MediaInfoExtractor.loadAudioTrack(from: asset), - let sharedAudioTrack = sharedAudioTrack - { - PluginLog.print("🔊 Processing audio for clip \(index)...") - PluginLog.print(" ✅ Audio track loaded from asset") - PluginLog.print(" Track ID: \(audioTrack.trackID)") - PluginLog.print( - " Duration: \(String(format: "%.2f", audioTrack.timeRange.duration.seconds))s" - ) - PluginLog.print(" Format: \(audioTrack.mediaType)") - - do { - try sharedAudioTrack.insertTimeRange( - clipTimeRange, - of: audioTrack, - at: totalDuration - ) - PluginLog.print(" ✅ Audio inserted into SHARED track!") - PluginLog.print( - " Source time range: \(String(format: "%.2f", clipTimeRange.start.seconds))s - \(String(format: "%.2f", (clipTimeRange.start + clipTimeRange.duration).seconds))s" - ) - PluginLog.print( - " Inserted at composition time: \(String(format: "%.2f", totalDuration.seconds))s" - ) - PluginLog.print( - " Audio duration: \(String(format: "%.2f", clipTimeRange.duration.seconds))s" - ) - } catch { - PluginLog.print(" ❌ ERROR inserting audio: \(error.localizedDescription)") - PluginLog.print(" Error details: \(error)") + // Update total duration (sequential part) + if clip.segmentTimeUs == nil { + totalDuration = CMTimeAdd(totalDuration, clipDuration) + } else { + let endInComposition = CMTimeAdd(insertionTime, clipDuration) + if CMTimeCompare(endInComposition, totalDuration) > 0 { + totalDuration = endInComposition } } - totalDuration = CMTimeAdd(totalDuration, clipDuration) PluginLog.print("✅ Clip \(index) added successfully") PluginLog.print(" - Duration: \(String(format: "%.2f", clipDuration.seconds))s") PluginLog.print( - " - Time range in composition: \(String(format: "%.2f", totalDuration.seconds - clipDuration.seconds))s - \(String(format: "%.2f", totalDuration.seconds))s" + " - Time range in composition: \(String(format: "%.2f", insertionTime.seconds))s - \(String(format: "%.2f", CMTimeAdd(insertionTime, clipDuration).seconds))s" ) } @@ -239,32 +245,20 @@ internal class VideoSequenceBuilder { PluginLog.print("📊 ===== VIDEO SEQUENCE SUMMARY =====") PluginLog.print(" Total clips: \(videoClips.count)") PluginLog.print(" Total duration: \(String(format: "%.2f", totalDuration.seconds))s") - PluginLog.print(" Max render size: \(maxRenderSize.width) x \(maxRenderSize.height)") + 
PluginLog.print(" Render size: \(maxRenderSize.width) x \(maxRenderSize.height)") PluginLog.print(" Max frame rate: \(maxFrameRate) fps") PluginLog.print(" Clip instructions: \(clipInstructions.count)") - - // Handle shared audio track - add to result if it has segments, otherwise remove from composition - if let audioTrack = sharedAudioTrack { - if !audioTrack.segments.isEmpty { - originalAudioTracks.append(audioTrack) - } else { - PluginLog.print(" ⚠️ Shared audio track has no segments - removing from composition") - composition.removeTrack(audioTrack) - } - } else { - PluginLog.print(" 🔊 AUDIO TRACKS: 0 (no audio track created)") - } - + PluginLog.print(" Audio tracks: \(originalAudioTracks.count)") PluginLog.print("=====================================") PluginLog.print("") return VideoSequenceResult( - videoTrack: compositionVideoTrack, audioTracks: originalAudioTracks, totalDuration: totalDuration, renderSize: maxRenderSize, frameRate: maxFrameRate, - clipInstructions: clipInstructions + clipInstructions: clipInstructions, + trackConfigs: trackConfigs ) } @@ -302,16 +296,18 @@ internal struct ClipInstruction { let transform: CGAffineTransform let naturalSize: CGSize let renderSize: CGSize + let trackID: CMPersistentTrackID + let audioTrackID: CMPersistentTrackID? } /// Result of building a video sequence. internal struct VideoSequenceResult { - let videoTrack: AVMutableCompositionTrack let audioTracks: [AVMutableCompositionTrack] let totalDuration: CMTime let renderSize: CGSize let frameRate: Float let clipInstructions: [ClipInstruction] + let trackConfigs: [CMPersistentTrackID: VideoClip] } /// Holds the data needed to construct an AVMutableVideoComposition without @@ -346,12 +342,12 @@ internal class CustomVideoCompositionInstruction: NSObject, AVVideoCompositionIn init( timeRange: CMTimeRange, - sourceTrackID: CMPersistentTrackID, + sourceTrackIDs: [CMPersistentTrackID], layerInstructions: [AVVideoCompositionLayerInstruction], backgroundColor: CGColor? = nil ) { self.timeRange = timeRange - self._requiredSourceTrackIDs = [NSNumber(value: sourceTrackID)] + self._requiredSourceTrackIDs = sourceTrackIDs.map { NSNumber(value: $0) } self.layerInstructions = layerInstructions self.backgroundColor = backgroundColor super.init() diff --git a/ios/Classes/src/features/render/models/RenderConfig.swift b/ios/Classes/src/features/render/models/RenderConfig.swift index 2e8b12d..d274b08 100644 --- a/ios/Classes/src/features/render/models/RenderConfig.swift +++ b/ios/Classes/src/features/render/models/RenderConfig.swift @@ -212,6 +212,12 @@ struct RenderConfig { /// Global end time in microseconds for trimming the final composition let endUs: Int64? + /// Target render width + let renderWidth: Double? + + /// Target render height + let renderHeight: Double? + /// Whether to optimize the video for network streaming (fast start). /// When true, moves the moov atom to the beginning of the file. let shouldOptimizeForNetworkUse: Bool @@ -248,6 +254,8 @@ struct RenderConfig { blur: self.blur, startUs: self.startUs, endUs: self.endUs, + renderWidth: self.renderWidth, + renderHeight: self.renderHeight, shouldOptimizeForNetworkUse: self.shouldOptimizeForNetworkUse, imageBytesWithCropping: self.imageBytesWithCropping ) @@ -269,7 +277,14 @@ struct RenderConfig { inputPath: inputPath, startUs: (clipMap["startUs"] as? NSNumber)?.int64Value, endUs: (clipMap["endUs"] as? NSNumber)?.int64Value, - volume: (clipMap["volume"] as? NSNumber)?.floatValue + volume: (clipMap["volume"] as? 
NSNumber)?.floatValue, + opacity: (clipMap["opacity"] as? NSNumber)?.doubleValue, + x: (clipMap["x"] as? NSNumber)?.doubleValue, + y: (clipMap["y"] as? NSNumber)?.doubleValue, + width: (clipMap["width"] as? NSNumber)?.doubleValue, + height: (clipMap["height"] as? NSNumber)?.doubleValue, + segmentTimeUs: (clipMap["segmentTimeUs"] as? NSNumber)?.int64Value, + zIndex: clipMap["zIndex"] as? Int ) } } @@ -320,6 +335,8 @@ struct RenderConfig { blur: (args["blur"] as? NSNumber)?.doubleValue, startUs: (args["startUs"] as? NSNumber)?.int64Value, endUs: (args["endUs"] as? NSNumber)?.int64Value, + renderWidth: (args["renderWidth"] as? NSNumber)?.doubleValue, + renderHeight: (args["renderHeight"] as? NSNumber)?.doubleValue, shouldOptimizeForNetworkUse: args["shouldOptimizeForNetworkUse"] as? Bool ?? true, imageBytesWithCropping: args["imageBytesWithCropping"] as? Bool ?? false ) diff --git a/ios/Classes/src/features/render/models/VideoClip.swift b/ios/Classes/src/features/render/models/VideoClip.swift index 6241247..dd59540 100644 --- a/ios/Classes/src/features/render/models/VideoClip.swift +++ b/ios/Classes/src/features/render/models/VideoClip.swift @@ -1,16 +1,44 @@ import Foundation -/// Represents a video clip with optional trimming and volume control +/// Represents a video clip with optional trimming, volume control, and positioning internal struct VideoClip { let inputPath: String let startUs: Int64? let endUs: Int64? let volume: Float? + let opacity: Double? - init(inputPath: String, startUs: Int64? = nil, endUs: Int64? = nil, volume: Float? = nil) { + // New fields for composition support + let x: Double? + let y: Double? + let width: Double? + let height: Double? + let segmentTimeUs: Int64? + let zIndex: Int? + + init( + inputPath: String, + startUs: Int64? = nil, + endUs: Int64? = nil, + volume: Float? = nil, + opacity: Double? = nil, + x: Double? = nil, + y: Double? = nil, + width: Double? = nil, + height: Double? = nil, + segmentTimeUs: Int64? = nil, + zIndex: Int? = nil + ) { self.inputPath = inputPath self.startUs = startUs self.endUs = endUs self.volume = volume + self.opacity = opacity + self.x = x + self.y = y + self.width = width + self.height = height + self.segmentTimeUs = segmentTimeUs + self.zIndex = zIndex } } diff --git a/ios/Classes/src/features/render/models/VideoCompositorConfig.swift b/ios/Classes/src/features/render/models/VideoCompositorConfig.swift index 374b5a0..f3ea65c 100644 --- a/ios/Classes/src/features/render/models/VideoCompositorConfig.swift +++ b/ios/Classes/src/features/render/models/VideoCompositorConfig.swift @@ -35,4 +35,12 @@ struct VideoCompositorConfig { /// Fallback source track ID for older iOS versions where sourceTrackIDs may be empty. /// This is used when the custom compositor doesn't receive track IDs properly. var sourceTrackID: CMPersistentTrackID = kCMPersistentTrackID_Invalid + + /// Mapping of track ID to video clip configuration for multi-track compositing + var videoClipConfigs: [CMPersistentTrackID: VideoClip] = [:] + + /// The intended render size of the composition (logical coordinate space). + /// This is used to calculate scale factors if the actual render context size + /// differs from the intended size (e.g. due to AVAssetExportSession presets). 
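/*
 Scale-factor example, with illustrative values: a composition authored at
 intendedRenderSize = 3840×2160 but exported through a 1080p preset renders
 at 1920×1080, so logical sizes are halved on both axes:

 let scaleFactorX = 1920.0 / 3840.0  // 0.5
 let scaleFactorY = 1080.0 / 2160.0  // 0.5
 // A clip configured with width 960 is drawn 480 px wide in the actual
 // render context.
*/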
+ var intendedRenderSize: CGSize = .zero } diff --git a/ios/Classes/src/features/render/utils/VideoCompositor.swift b/ios/Classes/src/features/render/utils/VideoCompositor.swift index 21fe06f..4d96cac 100644 --- a/ios/Classes/src/features/render/utils/VideoCompositor.swift +++ b/ios/Classes/src/features/render/utils/VideoCompositor.swift @@ -34,12 +34,15 @@ class VideoCompositor: NSObject, AVVideoCompositing { var cropWidth: CGFloat? var cropHeight: CGFloat? - // New properties for handling iPhone orientation var originalNaturalSize: CGSize = .zero + var intendedRenderSize: CGSize = .zero /// Fallback source track ID for older iOS versions var sourceTrackID: CMPersistentTrackID = kCMPersistentTrackID_Invalid + /// Track configurations for multi-track compositing + var videoClipConfigs: [CMPersistentTrackID: VideoClip] = [:] + /// Color filter configs for per-frame LUT computation private var colorFilterConfigs: [ColorFilterConfig] = [] @@ -58,7 +61,6 @@ class VideoCompositor: NSObject, AVVideoCompositing { var videoRotationDegrees: Double = 0.0 var shouldApplyOrientationCorrection: Bool = false - // Update the apply function: func apply(_ config: VideoCompositorConfig) { self.blurSigma = config.blurSigma self.rotateRadians = config.rotateRadians @@ -77,7 +79,9 @@ class VideoCompositor: NSObject, AVVideoCompositing { self.videoRotationDegrees = config.videoRotationDegrees self.shouldApplyOrientationCorrection = config.shouldApplyOrientationCorrection self.originalNaturalSize = config.originalNaturalSize + self.intendedRenderSize = config.intendedRenderSize self.sourceTrackID = config.sourceTrackID + self.videoClipConfigs = config.videoClipConfigs self.setOverlayImageLayers(from: config.imageLayerConfigs) self.colorFilterConfigs = config.colorFilterConfigs @@ -178,326 +182,186 @@ class VideoCompositor: NSObject, AVVideoCompositing { func renderContextChanged(_ newRenderContext: AVVideoCompositionRenderContext) {} func startRequest(_ request: AVAsynchronousVideoCompositionRequest) { - // Try to get source buffer from the first available track - var sourceBuffer: CVPixelBuffer? - - if !request.sourceTrackIDs.isEmpty { - sourceBuffer = request.sourceFrame(byTrackID: request.sourceTrackIDs[0].int32Value) - } - - // Fallback 1: Try to get track ID from layer instruction if sourceTrackIDs is empty - // This can happen on older iOS versions (iPhone 7, iOS 15) - if sourceBuffer == nil, - let instruction = request.videoCompositionInstruction - as? CustomVideoCompositionInstruction, - let layerInstruction = instruction.layerInstructions.first - { - let trackID = layerInstruction.trackID - if trackID != kCMPersistentTrackID_Invalid { - sourceBuffer = request.sourceFrame(byTrackID: trackID) + let renderSize = request.renderContext.size + let currentTimeUs = Int64(CMTimeGetSeconds(request.compositionTime) * 1_000_000) + + // Calculate scale factors between intended logical resolution and actual render size. + // This handles cases where AVAssetExportSession forces a different resolution + // (e.g. 1080p preset for a 4K composition). + let scaleFactorX = intendedRenderSize.width > 0 ? renderSize.width / intendedRenderSize.width : 1.0 + let scaleFactorY = intendedRenderSize.height > 0 ? renderSize.height / intendedRenderSize.height : 1.0 + + // 1. 
Define a common structure for all renderable items + enum RenderableItem { + case video(image: CIImage, clip: VideoClip, trackID: CMPersistentTrackID) + case imageLayer(layer: ImageLayer) + + var zIndex: Int { + switch self { + case .video(_, let clip, _): return clip.zIndex ?? 0 + case .imageLayer: return Int.max + } } } - // Fallback 2: Use the pre-configured sourceTrackID from VideoCompositorConfig - // This is set during composition building and guarantees we have the correct track ID - if sourceBuffer == nil && sourceTrackID != kCMPersistentTrackID_Invalid { - sourceBuffer = request.sourceFrame(byTrackID: sourceTrackID) - } - - guard let sourceBuffer = sourceBuffer else { - request.finish( - with: NSError( - domain: "VideoCompositor", code: 0, - userInfo: [ - NSLocalizedDescriptionKey: - "No source tracks available for compositing (sourceTrackIDs: \(request.sourceTrackIDs.count), configTrackID: \(sourceTrackID))" - ])) - return - } - var outputImage = CIImage(cvPixelBuffer: sourceBuffer) - - // Apply layer instruction transform first (video scaling/centering/rotation) - // This ensures all videos are properly sized and oriented before applying user effects. - // The layerInstruction contains the preferredTransform which already handles video rotation - // from portrait to landscape or vice versa, so no additional orientation correction is needed. - // - // IMPORTANT: AVFoundation uses a top-left origin coordinate system (Y points down), - // while CIImage uses a bottom-left origin (Y points up). We need to convert the transform - // to work correctly with CIImage's coordinate system. - - // Extract layer instruction from CustomVideoCompositionInstruction - var layerInstruction: AVVideoCompositionLayerInstruction? - if let customInstruction = request.videoCompositionInstruction - as? CustomVideoCompositionInstruction, - let firstLayerInstruction = customInstruction.layerInstructions.first - { - layerInstruction = firstLayerInstruction - } - - if let layerInstruction = layerInstruction { - var startTransform = CGAffineTransform.identity - var endTransform = CGAffineTransform.identity - var timeRange = CMTimeRange.zero - - // Get the transform at the current composition time - let hasTransform = layerInstruction.getTransformRamp( - for: request.compositionTime, - start: &startTransform, - end: &endTransform, - timeRange: &timeRange - ) - - if hasTransform && !startTransform.isIdentity { - // Convert AVFoundation transform to CIImage coordinate system: - // 1. Flip Y axis before transform (go from CIImage coords to AVFoundation coords) - // 2. Apply the AVFoundation transform - // 3. 
Flip Y axis after transform (go back to CIImage coords) - let imageHeight = outputImage.extent.height - - // Flip Y: translate to top, scale Y by -1 - let flipY = CGAffineTransform(scaleX: 1, y: -1) - .translatedBy(x: 0, y: -imageHeight) - - // Convert transform: flipY * transform * flipY^-1 - // But since flipY is its own inverse (when combined with translate), we use: - // result = flipY * transform * flipY (adjusted for new height after transform) - let convertedTransform = - flipY - .concatenating(startTransform) - - outputImage = outputImage.transformed(by: convertedTransform) - - // After transform, we need to flip back and normalize - let transformedExtent = outputImage.extent - let newHeight = transformedExtent.height - let flipBack = CGAffineTransform(scaleX: 1, y: -1) - .translatedBy(x: 0, y: -newHeight) - - outputImage = outputImage.transformed(by: flipBack) - - // Normalize position to origin - let finalExtent = outputImage.extent - if finalExtent.origin.x != 0 || finalExtent.origin.y != 0 { - let translation = CGAffineTransform( - translationX: -finalExtent.origin.x, - y: -finalExtent.origin.y - ) - outputImage = outputImage.transformed(by: translation) + var items: [RenderableItem] = [] + + // 2. Collect active video frames + for trackIDValue in request.sourceTrackIDs { + let trackID = trackIDValue.int32Value + if let sourceBuffer = request.sourceFrame(byTrackID: trackID), + let clipConfig = videoClipConfigs[trackID] { + + var frameImage = CIImage(cvPixelBuffer: sourceBuffer) + + // Apply individual track transform from layer instructions + if let customInstruction = request.videoCompositionInstruction as? CustomVideoCompositionInstruction { + for layerInstruction in customInstruction.layerInstructions { + if layerInstruction.trackID == trackID { + var startTransform = CGAffineTransform.identity + var endTransform = CGAffineTransform.identity + var timeRange = CMTimeRange.zero + + let hasTransform = layerInstruction.getTransformRamp( + for: request.compositionTime, + start: &startTransform, + end: &endTransform, + timeRange: &timeRange + ) + + if hasTransform && !startTransform.isIdentity { + let imageHeight = frameImage.extent.height + let flipY = CGAffineTransform(scaleX: 1, y: -1).translatedBy(x: 0, y: -imageHeight) + let convertedTransform = flipY.concatenating(startTransform) + frameImage = frameImage.transformed(by: convertedTransform) + + let transformedExtent = frameImage.extent + let flipBack = CGAffineTransform(scaleX: 1, y: -1).translatedBy(x: 0, y: -transformedExtent.height) + frameImage = frameImage.transformed(by: flipBack) + + // Normalize + let finalExtent = frameImage.extent + if finalExtent.origin.x != 0 || finalExtent.origin.y != 0 { + frameImage = frameImage.transformed(by: CGAffineTransform(translationX: -finalExtent.origin.x, y: -finalExtent.origin.y)) + } + } + break + } + } } + + items.append(.video(image: frameImage, clip: clipConfig, trackID: trackID)) } } - var center = CGPoint(x: outputImage.extent.midX, y: outputImage.extent.midY) - - // Apply user-defined effects (crop, rotation, flip, scale) - var transform = CGAffineTransform.identity - - // Apply LUT, blur, and flip BEFORE overlay when imageBytesWithCropping is enabled - // This ensures these effects only affect the video, not the overlay - if imageBytesWithCropping { - // Apply color filter (timed LUT) to video only - outputImage = applyColorFilter(to: outputImage, at: request.compositionTime) - - // Apply blur to video only - if blurSigma > 0 { - outputImage = 
outputImage.applyingGaussianBlur(sigma: blurSigma) + // 3. Collect active image layers + for layer in overlayImageLayers { + let inRange = (layer.startUs == -1 || currentTimeUs >= layer.startUs) && (layer.endUs == -1 || currentTimeUs <= layer.endUs) + if inRange { + items.append(.imageLayer(layer: layer)) } + } - // Apply flip to video only (before adding overlay) - if flipX || flipY { - let flipScaleX: CGFloat = flipX ? -1 : 1 - let flipScaleY: CGFloat = flipY ? -1 : 1 - - let flipTransform = CGAffineTransform(translationX: center.x, y: center.y) - .scaledBy(x: flipScaleX, y: flipScaleY) - .translatedBy(x: -center.x, y: -center.y) - - outputImage = outputImage.transformed(by: flipTransform) - - // Normalize position after flip - let flippedExtent = outputImage.extent - if flippedExtent.origin.x != 0 || flippedExtent.origin.y != 0 { - let translation = CGAffineTransform( - translationX: -flippedExtent.origin.x, - y: -flippedExtent.origin.y - ) - outputImage = outputImage.transformed(by: translation) - } - center = CGPoint(x: outputImage.extent.midX, y: outputImage.extent.midY) - } + if items.isEmpty { + PluginLog.print("⚠️ VideoCompositor: No active items found at time \(request.compositionTime.seconds)s") + request.finish(with: NSError(domain: "VideoCompositor", code: 0, userInfo: [NSLocalizedDescriptionKey: "No active items found"])) + return } - // Apply overlay BEFORE crop if imageBytesWithCropping is enabled - if imageBytesWithCropping { - let imageRect = outputImage.extent - - // Apply time-based overlay layers - let currentTimeUs = Int64(CMTimeGetSeconds(request.compositionTime) * 1_000_000) - for layer in overlayImageLayers { - // Check if current time is within the layer's time range - // startUs of -1 means "from the start of the video" - // endUs of -1 means "until the end of the video" - let inTimeRange = - (layer.startUs == -1 || currentTimeUs >= layer.startUs) - && (layer.endUs == -1 || currentTimeUs <= layer.endUs) - - if inTimeRange { - var img = layer.image - - if let w = layer.width, let h = layer.height { - let sx = CGFloat(w) / img.extent.width - let sy = CGFloat(h) / img.extent.height - img = img.transformed(by: CGAffineTransform(scaleX: sx, y: sy)) - } + // 4. Sort all items by zIndex + let sortedItems = items.sorted { $0.zIndex < $1.zIndex } + + // 5. Initialize background image (black frame) + var outputImage = CIImage(color: .black).cropped(to: CGRect(origin: .zero, size: renderSize)) + + // 6. Composite each item + for item in sortedItems { + switch item { + case .video(let img, let clip, _): + var frameImg = img + + // Apply custom size if provided, otherwise scale by global factor + if let w = clip.width, let h = clip.height { + let targetW = CGFloat(w) * scaleFactorX + let targetH = CGFloat(h) * scaleFactorY + let sx = targetW / frameImg.extent.width + let sy = targetH / frameImg.extent.height + frameImg = frameImg.transformed(by: CGAffineTransform(scaleX: sx, y: sy)) + } else if scaleFactorX != 1.0 || scaleFactorY != 1.0 { + frameImg = frameImg.transformed(by: CGAffineTransform(scaleX: scaleFactorX, y: scaleFactorY)) + } - let overlay: CIImage - if layer.x == nil && layer.y == nil { - // Stretch to fill frame when no position is specified - overlay = img.transformed( - by: CGAffineTransform( - scaleX: imageRect.width / img.extent.width, - y: imageRect.height / img.extent.height)) - } else { - // Position at specific coordinates - let posX = CGFloat(layer.x ?? 0) - let posY = CGFloat(layer.y ?? 
0) - // Convert y from top-left (Dart) to bottom-left (Core Graphics) - let cgY = imageRect.height - posY - img.extent.height - overlay = img.transformed( - by: CGAffineTransform(translationX: posX, y: cgY)) + // Apply custom offset if provided + if clip.x != nil || clip.y != nil { + let posX = CGFloat(clip.x ?? 0) * scaleFactorX + let posY = CGFloat(clip.y ?? 0) * scaleFactorY + // Convert from top-left (Flutter) to bottom-left (Core Image) + let cgY = renderSize.height - posY - frameImg.extent.height + frameImg = frameImg.transformed(by: CGAffineTransform(translationX: posX, y: cgY)) + } else if scaleFactorX != 1.0 || scaleFactorY != 1.0 { + // Normalize position if we scaled but didn't translate manually + let extent = frameImg.extent + if extent.origin.x != 0 || extent.origin.y != 0 { + frameImg = frameImg.transformed(by: CGAffineTransform(translationX: -extent.origin.x, y: -extent.origin.y)) } - - let (opacity, animTransform) = computeAnimation( - layer: layer, - currentTimeUs: currentTimeUs, - overlayExtent: overlay.extent, - frameExtent: imageRect - ) - outputImage = compositeOverlay( - overlay, over: outputImage, opacity: opacity, transform: animTransform) } - } - } - - // Cropping - if cropX != 0 || cropY != 0 || cropWidth != nil || cropHeight != nil { - let inputExtent = outputImage.extent - let videoWidth = inputExtent.width - let videoHeight = inputExtent.height - let x = cropX - var y = cropY - let width = cropWidth ?? (videoWidth - x) - let height = cropHeight ?? (videoHeight - y) + // Apply opacity if needed + if let opacity = clip.opacity, opacity < 1.0 { + frameImg = frameImg.applyingFilter("CIColorMatrix", parameters: [ + "inputAVector": CIVector(x: 0, y: 0, z: 0, w: CGFloat(opacity)), + ]) + } - y = videoHeight - height - y + outputImage = frameImg.composited(over: outputImage) - let cropRect = CGRect(x: x, y: y, width: width, height: height) + case .imageLayer(let layer): + var layerImg = layer.image + if let w = layer.width, let h = layer.height { + let targetW = CGFloat(w) * scaleFactorX + let targetH = CGFloat(h) * scaleFactorY + layerImg = layerImg.transformed(by: CGAffineTransform(scaleX: targetW/layerImg.extent.width, y: targetH/layerImg.extent.height)) + } - outputImage = outputImage.cropped(to: cropRect) - outputImage = outputImage.transformed( - by: CGAffineTransform( - translationX: -cropRect.origin.x, - y: -cropRect.origin.y + let overlay: CIImage + if layer.x == nil && layer.y == nil { + overlay = layerImg.transformed(by: CGAffineTransform(scaleX: renderSize.width/layerImg.extent.width, y: renderSize.height/layerImg.extent.height)) + } else { + let posX = CGFloat(layer.x ?? 0) * scaleFactorX + let posY = CGFloat(layer.y ?? 
0) * scaleFactorY + let cgY = renderSize.height - posY - layerImg.extent.height + overlay = layerImg.transformed(by: CGAffineTransform(translationX: posX, y: cgY)) + } - )) - center = CGPoint(x: outputImage.extent.midX, y: outputImage.extent.midY) + let (opacity, animTransform) = computeAnimation(layer: layer, currentTimeUs: currentTimeUs, overlayExtent: overlay.extent, frameExtent: CGRect(origin: .zero, size: renderSize)) + outputImage = compositeOverlay(overlay, over: outputImage, opacity: opacity, transform: animTransform) + } } - // Rotation - if rotateRadians != 0 { - // Rotate the image - let rotation = CGAffineTransform(rotationAngle: rotateRadians) - let rotatedImage = outputImage.transformed(by: rotation) - - // Get the new bounding box after rotation - let rotatedExtent = rotatedImage.extent - - // Translate to (0, 0) - let translation = CGAffineTransform( - translationX: -rotatedExtent.origin.x, y: -rotatedExtent.origin.y) - outputImage = rotatedImage.transformed(by: translation) - center = CGPoint(x: outputImage.extent.midX, y: outputImage.extent.midY) - } + // 7. Apply global effects (if any) + let center = CGPoint(x: outputImage.extent.midX, y: outputImage.extent.midY) + var transform = CGAffineTransform.identity - // Flipping (only if NOT imageBytesWithCropping - otherwise already applied before overlay) - if !imageBytesWithCropping && (flipX || flipY) { + // Apply flip (Global) + if flipX || flipY { let scaleX: CGFloat = flipX ? -1 : 1 let scaleY: CGFloat = flipY ? -1 : 1 - - let flipTransform = CGAffineTransform(translationX: center.x, y: center.y) + transform = transform.concatenating(CGAffineTransform(translationX: center.x, y: center.y) .scaledBy(x: scaleX, y: scaleY) - .translatedBy(x: -center.x, y: -center.y) - - transform = transform.concatenating(flipTransform) + .translatedBy(x: -center.x, y: -center.y)) } - // Apply Scale + // Apply Global Scale if scaleX != 1 || scaleY != 1 { transform = transform.scaledBy(x: scaleX, y: scaleY) } outputImage = outputImage.transformed(by: transform) - // Apply color filter (only if NOT imageBytesWithCropping - otherwise already applied before overlay) - if !imageBytesWithCropping { - outputImage = applyColorFilter(to: outputImage, at: request.compositionTime) - - // Apply blur - if blurSigma > 0 { - outputImage = outputImage.applyingGaussianBlur(sigma: blurSigma) - } - } - - // Apply overlay image layers (only if not already applied before crop) - if !imageBytesWithCropping { - let imageRect = outputImage.extent - - // Apply time-based overlay layers with positioning - let currentTimeUs = Int64(CMTimeGetSeconds(request.compositionTime) * 1_000_000) - for layer in overlayImageLayers { - // Check if current time is within the layer's time range - // startUs of -1 means "from the start of the video" - // endUs of -1 means "until the end of the video" - let inTimeRange = - (layer.startUs == -1 || currentTimeUs >= layer.startUs) - && (layer.endUs == -1 || currentTimeUs <= layer.endUs) - if inTimeRange { - var img = layer.image - - if let w = layer.width, let h = layer.height { - let sx = CGFloat(w) / img.extent.width - let sy = CGFloat(h) / img.extent.height - img = img.transformed(by: CGAffineTransform(scaleX: sx, y: sy)) - } - - let overlay: CIImage - if layer.x == nil && layer.y == nil { - // Stretch to fill frame when no position is specified - overlay = img.transformed( - by: CGAffineTransform( - scaleX: imageRect.width / img.extent.width, - y: imageRect.height / img.extent.height)) - } else { - // Position at specific coordinates 
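Both platform compositors repeat this top-left to bottom-left Y conversion for overlays and, in the new multi-track path, for positioned clips. As a standalone illustration of the math (helper name and values are illustrative, not part of the plugin):

```swift
import CoreGraphics

/// Flutter/Dart positions use a top-left origin; CIImage composites in a
/// bottom-left origin. An item whose top edge sits `posY` below the top of
/// the frame must therefore be placed at `frameHeight - posY - itemHeight`.
func bottomLeftY(topLeftY posY: CGFloat, frameHeight: CGFloat, itemHeight: CGFloat) -> CGFloat {
    frameHeight - posY - itemHeight
}

// Example: a 200 pt tall overlay at y = 50 in a 1080 pt high frame lands at
// CI y = 1080 - 50 - 200 = 830.
```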
-            let posX = CGFloat(layer.x ?? 0)
-            let posY = CGFloat(layer.y ?? 0)
-            // Convert y from top-left (Dart) to bottom-left (Core Graphics)
-            let cgY = imageRect.height - posY - img.extent.height
-            overlay = img.transformed(
-              by: CGAffineTransform(translationX: posX, y: cgY))
-          }
-
-          let (opacity, animTransform) = computeAnimation(
-            layer: layer,
-            currentTimeUs: currentTimeUs,
-            overlayExtent: overlay.extent,
-            frameExtent: imageRect
-          )
-          outputImage = compositeOverlay(
-            overlay, over: outputImage, opacity: opacity, transform: animTransform)
-        }
-      }
+    // Apply LUT and Blur (Global)
+    outputImage = applyColorFilter(to: outputImage, at: request.compositionTime)
+    if blurSigma > 0 {
+      outputImage = outputImage.applyingGaussianBlur(sigma: blurSigma)
+    }

     guard let outputBuffer = request.renderContext.newPixelBuffer() else {
diff --git a/lib/core/models/video/video_render_data_model.dart b/lib/core/models/video/video_render_data_model.dart
index 7399245..3e270a6 100644
--- a/lib/core/models/video/video_render_data_model.dart
+++ b/lib/core/models/video/video_render_data_model.dart
@@ -408,25 +408,6 @@ class VideoRenderData {
     double? scaleX = transform.scaleX;
     double? scaleY = transform.scaleY;

-    // Handle quality config
-    if (qualityConfig != null && scaleX == null && scaleY == null) {
-      final targetVideo = video ??
-          (videoSegments != null && videoSegments!.isNotEmpty
-              ? videoSegments!.first.video
-              : null);
-      if (targetVideo != null) {
-        final meta = await ProVideoEditor.instance.getMetadata(targetVideo);
-        final originalResolution = meta.resolution;
-        final targetResolution =
-            qualityConfig!.resolution ?? originalResolution;
-        final sx = targetResolution.width / originalResolution.width;
-        final sy = targetResolution.height / originalResolution.height;
-        final scale = sx < sy ? sx : sy;
-        scaleX = scale;
-        scaleY = scale;
-      }
-    }
-
     // Convert video clips to map format
     // ignore: deprecated_member_use_from_same_package
     final fallbackVolume = originalAudioVolume;
@@ -536,6 +517,8 @@
       'bitrate': bitrate,
       'scaleX': scaleX,
       'scaleY': scaleY,
+      'renderWidth': qualityConfig?.resolution?.width,
+      'renderHeight': qualityConfig?.resolution?.height,
       // Global trim for entire composition (only for videoSegments,
       // not single video). For single video, startTime/endTime are already
       // applied to the clip itself
diff --git a/lib/core/models/video/video_segment_model.dart b/lib/core/models/video/video_segment_model.dart
index c927e0d..2f31b5b 100644
--- a/lib/core/models/video/video_segment_model.dart
+++ b/lib/core/models/video/video_segment_model.dart
@@ -1,5 +1,6 @@
 // ignore_for_file: public_member_api_docs, sort_constructors_first
 import 'dart:convert';
+import 'dart:ui';

 import 'package:pro_video_editor/pro_video_editor.dart';
 import 'package:pro_video_editor/shared/utils/parser/double_parser.dart';
@@ -16,6 +17,11 @@ class VideoSegment {
     this.startTime,
     this.endTime,
     this.volume,
+    this.offset,
+    this.size,
+    this.zIndex,
+    this.opacity,
+    this.segmentTime,
   }) : assert(
           startTime == null || endTime == null || startTime < endTime,
           'startTime must be before endTime',
@@ -50,6 +56,42 @@
   /// If null, the original volume is used.
   final double? volume;

+  /// The stacking order of overlapping elements along the z-axis.
+  /// Segments with a higher zIndex are rendered on top.
+  /// Defaults to 0 if not specified.
+  ///
+  /// If multiple segments share the same zIndex, their list order determines
+  /// stacking (the later segment is rendered on top).
+  final int? zIndex;
+
+  /// The opacity of this video segment when it overlaps other elements.
+  final double? opacity;
+
+  /// Position offset from the top-left corner of the video frame, in pixels.
+  ///
+  /// [Offset.dx] is the horizontal offset from the left edge.
+  /// [Offset.dy] is the vertical offset from the top edge.
+  ///
+  /// When `null`, the segment is stretched to fill the entire video frame.
+  /// When set to a specific value (e.g., [Offset.zero]), the segment is
+  /// placed at that position at its original size.
+  final Offset? offset;
+
+  /// The display size of this video segment, in pixels.
+  ///
+  /// [Size.width] is the target width of the segment.
+  /// [Size.height] is the target height of the segment.
+  ///
+  /// When `null`, the segment is used at its original size (or stretched to
+  /// fill the frame when [offset] is also `null`).
+  final Size? size;
+
+  /// Optional start time for this video segment in the rendered video.
+  ///
+  /// If null, the clip starts right after the previous video segment.
+  final Duration? segmentTime;
+
   /// Converts this clip to a map for platform channel communication.
   Future<Map<String, dynamic>> toAsyncMap() async {
     final inputPath = await video.safeFilePath();
@@ -59,6 +101,13 @@
       'startUs': startTime?.inMicroseconds,
       'endUs': endTime?.inMicroseconds,
       'volume': volume,
+      'zIndex': zIndex,
+      'opacity': opacity,
+      'x': offset?.dx,
+      'y': offset?.dy,
+      'width': size?.width,
+      'height': size?.height,
+      'segmentTimeUs': segmentTime?.inMicroseconds,
     };
   }
@@ -68,12 +117,22 @@
     Duration? startTime,
     Duration? endTime,
     double? volume,
+    double? opacity,
+    int? zIndex,
+    Offset? offset,
+    Size? size,
+    Duration? segmentTime,
   }) {
     return VideoSegment(
       video: video ?? this.video,
       startTime: startTime ?? this.startTime,
       endTime: endTime ?? this.endTime,
       volume: volume ?? this.volume,
+      zIndex: zIndex ?? this.zIndex,
+      opacity: opacity ?? this.opacity,
+      offset: offset ?? this.offset,
+      size: size ?? this.size,
+      segmentTime: segmentTime ?? this.segmentTime,
     );
   }
@@ -84,7 +143,12 @@
     return other.video == video &&
         other.startTime == startTime &&
         other.endTime == endTime &&
-        other.volume == volume;
+        other.volume == volume &&
+        other.zIndex == zIndex &&
+        other.opacity == opacity &&
+        other.offset == offset &&
+        other.size == size &&
+        other.segmentTime == segmentTime;
   }

   @override
@@ -92,7 +156,12 @@
     return video.hashCode ^
         startTime.hashCode ^
         endTime.hashCode ^
-        volume.hashCode;
+        volume.hashCode ^
+        zIndex.hashCode ^
+        opacity.hashCode ^
+        offset.hashCode ^
+        size.hashCode ^
+        segmentTime.hashCode;
   }

   @override
@@ -100,7 +169,12 @@
     return 'VideoSegment(video: $video, '
         'startTime: $startTime, '
         'endTime: $endTime, '
-        'volume: $volume)';
+        'volume: $volume, '
+        'zIndex: $zIndex, '
+        'opacity: $opacity, '
+        'offset: $offset, '
+        'size: $size, '
+        'segmentTime: $segmentTime)';
   }

   Map<String, dynamic> toMap() {
@@ -109,6 +183,12 @@
       'startTime': startTime?.inMicroseconds,
       'endTime': endTime?.inMicroseconds,
       'volume': volume,
+      'zIndex': zIndex,
+      'opacity': opacity,
+      'offset': offset != null ? {'dx': offset!.dx, 'dy': offset!.dy} : null,
+      'size':
+          size != null ? {'width': size!.width, 'height': size!.height} : null,
+      'segmentTime': segmentTime?.inMicroseconds,
     };
   }
@@ -122,6 +202,23 @@
           ?
Duration(microseconds: safeParseInt(map['endTime'])) : null, volume: tryParseDouble(map['volume']), + zIndex: tryParseInt(map['zIndex']), + opacity: tryParseDouble(map['opacity']), + offset: map['offset'] != null + ? Offset( + safeParseDouble((map['offset'] as Map)['dx']), + safeParseDouble((map['offset'] as Map)['dy']), + ) + : null, + size: map['size'] != null + ? Size( + safeParseDouble((map['size'] as Map)['width']), + safeParseDouble((map['size'] as Map)['height']), + ) + : null, + segmentTime: map['segmentTime'] != null + ? Duration(microseconds: safeParseInt(map['segmentTime'])) + : null, ); } diff --git a/macos/Classes/src/features/render/RenderVideo.swift b/macos/Classes/src/features/render/RenderVideo.swift index 4c51432..3a95f8b 100644 --- a/macos/Classes/src/features/render/RenderVideo.swift +++ b/macos/Classes/src/features/render/RenderVideo.swift @@ -75,7 +75,14 @@ class RenderVideo { inputPath: newPath, startUs: clip.startUs, endUs: clip.endUs, - volume: clip.volume + volume: clip.volume, + opacity: clip.opacity, + x: clip.x, + y: clip.y, + width: clip.width, + height: clip.height, + segmentTimeUs: clip.segmentTimeUs, + zIndex: clip.zIndex ) } return clip @@ -139,13 +146,16 @@ class RenderVideo { var effectsConfig = VideoCompositorConfig() // Use composition helper to merge multiple video clips - let (composition, videoCompData, renderSize, audioMix, sourceTrackID) = + let (composition, videoCompData, renderSize, audioMix, sourceTrackID, updatedEffectsConfig) = try await applyComposition( videoClips: workingConfig.videoClips, videoEffects: effectsConfig, enableAudio: workingConfig.enableAudio, - audioTracks: workingConfig.audioTracks + audioTracks: workingConfig.audioTracks, + renderWidth: workingConfig.renderWidth, + renderHeight: workingConfig.renderHeight ) + effectsConfig = updatedEffectsConfig var videoCompConfig = videoCompData // Set source track ID for fallback on older macOS versions @@ -236,10 +246,18 @@ class RenderVideo { videoComposition.frameDuration = videoCompConfig.frameDuration videoComposition.renderSize = finalRenderSize videoComposition.instructions = videoCompConfig.instructions + + // Ensure compositor knows the intended logical size for coordinate mapping + effectsConfig.intendedRenderSize = finalRenderSize + videoComposition.customVideoCompositorClass = makeVideoCompositorSubclass( with: effectsConfig) - let preset = applyBitrate(requestedBitrate: workingConfig.bitrate) + let preset = applyBitrate( + requestedBitrate: workingConfig.bitrate, + renderWidth: workingConfig.renderWidth, + renderHeight: workingConfig.renderHeight + ) let export = try await prepareExportSession( composition: composition, diff --git a/macos/Classes/src/features/render/helpers/ApplyBitrate.swift b/macos/Classes/src/features/render/helpers/ApplyBitrate.swift index 9eaf01b..00a6287 100644 --- a/macos/Classes/src/features/render/helpers/ApplyBitrate.swift +++ b/macos/Classes/src/features/render/helpers/ApplyBitrate.swift @@ -9,7 +9,8 @@ import Foundation /// /// - Parameters: /// - requestedBitrate: Target bitrate in bits per second. If nil, returns highest quality. -/// - presetHint: Optional preset hint (currently unused). +/// - renderWidth: Optional target render width to ensure preset supports the resolution. +/// - renderHeight: Optional target render height to ensure preset supports the resolution. /// - Returns: AVAssetExportPreset string matching the requested quality level. 
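Read on its own, the resolution branch added below maps the larger target dimension to the smallest preset that can hold it, gating HEVC presets behind availability checks. A hedged sketch of that mapping (the helper name is hypothetical; the real function also falls through to the bitrate table below):

```swift
import AVFoundation

func presetForResolutionSketch(maxDim: Double) -> String {
    if maxDim > 3840 {
        // 8K export requires the HEVC preset introduced in macOS 12.1.
        if #available(macOS 12.1, *) {
            return AVAssetExportPresetHEVC7680x4320
        }
    } else if maxDim > 1920 {
        // Prefer HEVC 4K where available, otherwise H.264 4K.
        if #available(macOS 10.13, *) {
            return AVAssetExportPresetHEVC3840x2160
        }
        return AVAssetExportPreset3840x2160
    } else if maxDim > 1280 {
        return AVAssetExportPreset1920x1080
    }
    // No resolution match: defer to the bitrate-based mapping.
    return AVAssetExportPresetHighestQuality
}
```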
/// /// Bitrate mapping: @@ -24,14 +25,30 @@ import Foundation /// - ≥2 Mbps: 480p /// - ≥1 Mbps: Medium quality /// - <1 Mbps: Low quality -public func applyBitrate(requestedBitrate: Int?, presetHint: String? = nil) -> String { - if let bitrate = requestedBitrate { - PluginLog.print( - "[\(Tags.render)] 📊 Requested bitrate: \(bitrate) bps (\(String(format: "%.1f", Double(bitrate) / 1_000_000)) Mbps)" - ) - PluginLog.print( - "[\(Tags.render)] ⚠️ AVAssetExportSession does not support custom bitrate directly - using closest preset" - ) +public func applyBitrate( + requestedBitrate: Int?, + renderWidth: Double? = nil, + renderHeight: Double? = nil +) -> String { + // If a custom resolution is provided, we should ideally use a "HighestQuality" + // preset to avoid resolution constraints from bitrate-based presets. + // However, if a bitrate is also specified, we'll try to pick the best matching one. + if let rw = renderWidth, let rh = renderHeight { + let maxDim = max(rw, rh) + + if maxDim > 3840 { + if #available(macOS 12.1, *) { + return AVAssetExportPresetHEVC7680x4320 + } + } else if maxDim > 1920 { + if #available(macOS 10.13, *) { + return AVAssetExportPresetHEVC3840x2160 + } else { + return AVAssetExportPreset3840x2160 + } + } else if maxDim > 1280 { + return AVAssetExportPreset1920x1080 + } } if let bitrate = requestedBitrate { diff --git a/macos/Classes/src/features/render/helpers/ApplyComposition.swift b/macos/Classes/src/features/render/helpers/ApplyComposition.swift index d1c28b9..bd1a926 100644 --- a/macos/Classes/src/features/render/helpers/ApplyComposition.swift +++ b/macos/Classes/src/features/render/helpers/ApplyComposition.swift @@ -19,18 +19,22 @@ import Foundation /// - CGSize: Final render size (max dimensions from all clips) /// - AVAudioMix?: Audio mix with volume controls (nil if no audio mixing needed) /// - CMPersistentTrackID: The track ID of the video composition track (for fallback on older macOS) +/// - VideoCompositorConfig: Updated compositor configuration with track info /// /// - Throws: NSError if video clips are empty, files don't exist, or tracks can't be loaded. func applyComposition( videoClips: [VideoClip], videoEffects: VideoCompositorConfig, enableAudio: Bool, - audioTracks: [AudioTrackConfig] + audioTracks: [AudioTrackConfig], + renderWidth: Double? = nil, + renderHeight: Double? = nil ) async throws -> ( - AVMutableComposition, VideoCompositionData, CGSize, AVAudioMix?, CMPersistentTrackID + AVMutableComposition, VideoCompositionData, CGSize, AVAudioMix?, CMPersistentTrackID, VideoCompositorConfig ) { return try await CompositionBuilder(videoClips: videoClips, videoEffects: videoEffects) .setEnableAudio(enableAudio) .setAudioTracks(audioTracks) + .setRenderSize(width: renderWidth, height: renderHeight) .build() } diff --git a/macos/Classes/src/features/render/helpers/ApplyPlaybackSpeed.swift b/macos/Classes/src/features/render/helpers/ApplyPlaybackSpeed.swift index 440117b..6d54575 100644 --- a/macos/Classes/src/features/render/helpers/ApplyPlaybackSpeed.swift +++ b/macos/Classes/src/features/render/helpers/ApplyPlaybackSpeed.swift @@ -25,9 +25,7 @@ public func applyPlaybackSpeed( guard let speed = speed, speed > 0, speed != 1 else { return instructions } let speedType = speed < 1 ? 
"slow motion" : "fast forward" - PluginLog.print( - "[\(Tags.render)] ⚡ Applying playback speed: \(String(format: "%.2f", speed))x (\(speedType))" - ) + PluginLog.print("[\(Tags.render)] ⚡ Applying playback speed: \(String(format: "%.2f", speed))x (\(speedType))") let multiplier = 1.0 / Double(speed) @@ -44,14 +42,11 @@ public func applyPlaybackSpeed( return instruction } let scaledStart = CMTimeMultiplyByFloat64(custom.timeRange.start, multiplier: multiplier) - let scaledDuration = CMTimeMultiplyByFloat64( - custom.timeRange.duration, multiplier: multiplier) - let trackID = - (custom.requiredSourceTrackIDs?.first as? NSNumber)?.int32Value - ?? kCMPersistentTrackID_Invalid + let scaledDuration = CMTimeMultiplyByFloat64(custom.timeRange.duration, multiplier: multiplier) + let trackIDs = custom.requiredSourceTrackIDs?.compactMap { ($0 as? NSNumber)?.int32Value } ?? [] return CustomVideoCompositionInstruction( timeRange: CMTimeRange(start: scaledStart, duration: scaledDuration), - sourceTrackID: trackID, + sourceTrackIDs: trackIDs, layerInstructions: custom.layerInstructions, backgroundColor: custom.backgroundColor ) diff --git a/macos/Classes/src/features/render/helpers/CompositionBuilder.swift b/macos/Classes/src/features/render/helpers/CompositionBuilder.swift index 224909d..64b183d 100644 --- a/macos/Classes/src/features/render/helpers/CompositionBuilder.swift +++ b/macos/Classes/src/features/render/helpers/CompositionBuilder.swift @@ -12,6 +12,8 @@ internal class CompositionBuilder { private let videoEffects: VideoCompositorConfig private var enableAudio: Bool = true private var audioTracks: [AudioTrackConfig] = [] + private var renderWidth: Double? + private var renderHeight: Double? /// Initializes builder with configuration. /// @@ -23,6 +25,13 @@ internal class CompositionBuilder { self.videoEffects = videoEffects } + /// Sets the target render size. + func setRenderSize(width: Double?, height: Double?) -> CompositionBuilder { + self.renderWidth = width + self.renderHeight = height + return self + } + /// Enables or disables audio. 
/// /// - Parameter enabled: If true, includes original audio from video clips @@ -46,7 +55,7 @@ internal class CompositionBuilder { /// - Returns: Tuple containing composition, video composition, render size, audio mix, and source track ID /// - Throws: Error if composition creation fails func build() async throws -> ( - AVMutableComposition, VideoCompositionData, CGSize, AVAudioMix?, CMPersistentTrackID + AVMutableComposition, VideoCompositionData, CGSize, AVAudioMix?, CMPersistentTrackID, VideoCompositorConfig ) { guard !videoClips.isEmpty else { throw NSError( @@ -64,9 +73,14 @@ internal class CompositionBuilder { // Build video sequence let videoBuilder = VideoSequenceBuilder(videoClips: videoClips) .setEnableAudio(enableAudio) + .setRenderSize(width: renderWidth, height: renderHeight) let videoResult = try await videoBuilder.build(in: composition) + // Store track configs for compositor + var updatedVideoEffects = videoEffects + updatedVideoEffects.videoClipConfigs = videoResult.trackConfigs + // Add custom audio tracks var customAudioTracks: [(track: AVMutableCompositionTrack, config: AudioTrackConfig)] = [] for trackConfig in audioTracks { @@ -106,9 +120,7 @@ internal class CompositionBuilder { ) let compositionRenderSize = videoResult.renderSize - // Create instructions for each clip segment - // Use custom instruction class to ensure requiredSourceTrackIDs is properly set - // This fixes issues on older macOS versions + // Create instructions for each non-overlapping time segment var instructions: [AVVideoCompositionInstructionProtocol] = [] PluginLog.print("") @@ -120,6 +132,8 @@ internal class CompositionBuilder { PluginLog.print("==========================================") PluginLog.print("") + // Calculate pre-determined transforms for all clips + var clipTransforms: [CGAffineTransform] = [] for (index, clipInstruction) in videoResult.clipInstructions.enumerated() { PluginLog.print("🎬 Processing instruction for clip \(index)") PluginLog.print( @@ -133,36 +147,55 @@ internal class CompositionBuilder { with: clipInstruction.transform, clipIndex: index ) + clipTransforms.append(transform) + } - let layerInstruction: AVVideoCompositionLayerInstruction - if #available(macOS 26.0, *) { - var config = AVVideoCompositionLayerInstruction.Configuration( - assetTrack: videoResult.videoTrack - ) - config.setTransform(transform, at: .zero) - layerInstruction = AVVideoCompositionLayerInstruction(configuration: config) - } else { - let mutableInstruction = AVMutableVideoCompositionLayerInstruction( - assetTrack: videoResult.videoTrack - ) - mutableInstruction.setTransform(transform, at: .zero) - layerInstruction = mutableInstruction - } - - // Use custom instruction that explicitly provides requiredSourceTrackIDs - let instruction = CustomVideoCompositionInstruction( - timeRange: clipInstruction.timeRange, - sourceTrackID: videoResult.videoTrack.trackID, - layerInstructions: [layerInstruction], - backgroundColor: CGColor(red: 0, green: 0, blue: 0, alpha: 1) - ) + // Calculate non-overlapping time segments + let segments = calculateSegments( + from: videoResult.clipInstructions, + totalDuration: videoResult.totalDuration + ) + for (segIndex, segmentRange) in segments.enumerated() { + PluginLog.print("🎬 Processing segment \(segIndex)") PluginLog.print( - " ⚙️ Layer instruction configured with transform (trackID: \(videoResult.videoTrack.trackID))" + " Time range: \(String(format: "%.2f", segmentRange.start.seconds))s - \(String(format: "%.2f", (segmentRange.start + 
segmentRange.duration).seconds))s" ) - PluginLog.print("") - instructions.append(instruction) + var activeTrackIDs: [CMPersistentTrackID] = [] + var layerInstructions: [AVVideoCompositionLayerInstruction] = [] + + for (clipIndex, clipInstruction) in videoResult.clipInstructions.enumerated() { + // Check if this clip is active during this segment + let clipRange = clipInstruction.timeRange + let intersection = CMTimeRangeGetIntersection(segmentRange, otherRange: clipRange) + + if CMTimeGetSeconds(intersection.duration) > 0 { + activeTrackIDs.append(clipInstruction.trackID) + + let transform = clipTransforms[clipIndex] + let mutableLayerInstruction = AVMutableVideoCompositionLayerInstruction( + assetTrack: composition.track(withTrackID: clipInstruction.trackID)! + ) + mutableLayerInstruction.setTransform(transform, at: .zero) + layerInstructions.append(mutableLayerInstruction) + + PluginLog.print(" - Added trackID \(clipInstruction.trackID) (Clip \(clipIndex))") + } + } + + if !layerInstructions.isEmpty { + // Use custom instruction that explicitly provides requiredSourceTrackIDs + let instruction = CustomVideoCompositionInstruction( + timeRange: segmentRange, + sourceTrackIDs: activeTrackIDs, + layerInstructions: layerInstructions, + backgroundColor: CGColor(red: 0, green: 0, blue: 0, alpha: 1) + ) + instructions.append(instruction) + PluginLog.print(" ✅ Segment instruction created with \(layerInstructions.count) layers") + } + PluginLog.print("") } let videoCompositionData = VideoCompositionData( @@ -173,10 +206,10 @@ internal class CompositionBuilder { PluginLog.print("✅ Composition created successfully with \(videoClips.count) clips") - // Return the track ID for fallback on older macOS versions - let sourceTrackID = videoResult.videoTrack.trackID + // Return the first track ID for fallback on older macOS versions + let sourceTrackID = videoResult.clipInstructions.first?.trackID ?? kCMPersistentTrackID_Invalid - return (composition, videoCompositionData, videoResult.renderSize, audioMix, sourceTrackID) + return (composition, videoCompositionData, videoResult.renderSize, audioMix, sourceTrackID, updatedVideoEffects) } /// Creates audio mix with per-clip and per-track volume parameters. @@ -191,11 +224,17 @@ internal class CompositionBuilder { for track in originalTracks { let inputParameters = AVMutableAudioMixInputParameters(track: track) - // Use setVolumeRamp for each clip's time range to ensure - // volume changes are applied precisely per segment - for (index, clipInstruction) in clipInstructions.enumerated() { + // Find all instructions that apply to this specific audio track + let relevantInstructions = clipInstructions.enumerated().filter { _, instruction in + instruction.audioTrackID == track.trackID + } + + for (index, clipInstruction) in relevantInstructions { let clipVolume = index < videoClips.count ? (videoClips[index].volume ?? 
1.0) : 1.0
+
+                PluginLog.print("🔊 Setting volume ramp for track \(track.trackID): volume=\(clipVolume) at \(String(format: "%.2f", clipInstruction.timeRange.start.seconds))s")
+
                inputParameters.setVolumeRamp(
                    fromStartVolume: clipVolume,
                    toEndVolume: clipVolume,
@@ -204,7 +243,7 @@
            }

            audioMixInputParameters.append(inputParameters)
-            PluginLog.print("🔊 Applied per-clip volume to original audio track")
+            PluginLog.print("🔊 Applied per-clip volume to original audio track (ID: \(track.trackID))")
        }

        // Apply volume to custom audio tracks
@@ -234,6 +273,16 @@
        with preferredTransform: CGAffineTransform,
        clipIndex: Int
    ) -> CGAffineTransform {
+        // If manual positioning is requested, use the preferred transform as-is.
+        // The compositor will handle custom scaling and positioning based on VideoClip config.
+        if clipIndex < videoClips.count {
+            let clip = videoClips[clipIndex]
+            if clip.x != nil || clip.y != nil || clip.width != nil || clip.height != nil {
+                PluginLog.print("    🎯 Manual positioning detected for clip \(clipIndex), skipping fit and center transform")
+                return preferredTransform
+            }
+        }
+
        // Get the display size after applying the original transform (handles rotation)
        let displaySize = naturalSize.applying(preferredTransform)
        let videoWidth = abs(displaySize.width)
@@ -315,4 +364,39 @@
        return transform
    }
+
+    /// Calculates non-overlapping time segments from clip instructions.
+    private func calculateSegments(from instructions: [ClipInstruction], totalDuration: CMTime) -> [CMTimeRange] {
+        var points: [CMTime] = [.zero, totalDuration]
+        for instruction in instructions {
+            points.append(instruction.timeRange.start)
+            points.append(CMTimeAdd(instruction.timeRange.start, instruction.timeRange.duration))
+        }
+
+        let sortedPoints = points
+            .filter { CMTimeCompare($0, totalDuration) <= 0 }
+            .sorted { CMTimeCompare($0, $1) < 0 }
+
+        var uniquePoints: [CMTime] = []
+        for point in sortedPoints {
+            if let last = uniquePoints.last {
+                if CMTimeCompare(last, point) != 0 {
+                    uniquePoints.append(point)
+                }
+            } else {
+                uniquePoints.append(point)
+            }
+        }
+
+        var segments: [CMTimeRange] = []
+        for i in 0..<uniquePoints.count - 1 {
+            let start = uniquePoints[i]
+            let duration = CMTimeSubtract(uniquePoints[i + 1], start)
+            if CMTimeGetSeconds(duration) > 0 {
+                segments.append(CMTimeRange(start: start, duration: duration))
+            }
+        }
+        return segments
+    }
}
diff --git a/macos/Classes/src/features/render/helpers/VideoSequenceBuilder.swift b/macos/Classes/src/features/render/helpers/VideoSequenceBuilder.swift
index 8fdc973..3cca21b 100644
--- a/macos/Classes/src/features/render/helpers/VideoSequenceBuilder.swift
+++ b/macos/Classes/src/features/render/helpers/VideoSequenceBuilder.swift
@@ -9,6 +9,8 @@
internal class VideoSequenceBuilder {
    private let videoClips: [VideoClip]
    private var enableAudio: Bool = true
+    private var renderWidth: Double?
+    private var renderHeight: Double?

    /// Initializes builder with video clips.
    ///
@@ -17,6 +19,13 @@
        self.videoClips = videoClips
    }

+    /// Sets the target render size.
+    func setRenderSize(width: Double?, height: Double?) -> VideoSequenceBuilder {
+        self.renderWidth = width
+        self.renderHeight = height
+        return self
+    }
+
    /// Enables or disables audio in the output.
    ///
    /// - Parameter enabled: If true, includes original audio from video clips
@@ -67,7 +76,7 @@
    /// Builds the video composition with all clips.
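To make the boundary-point segmentation concrete: with clip A spanning 0–10 s and clip B overlapping at 5–8 s, the collected points {0, 5, 8, 10} produce three segments, each with a stable set of active tracks ([0,5) has A, [5,8) has A and B, [8,10) has A). A minimal worked example (values illustrative):

```swift
import CoreMedia

let boundaryPoints: [CMTime] = [0.0, 5.0, 8.0, 10.0].map {
    CMTime(seconds: $0, preferredTimescale: 600)
}

// Adjacent boundary points become non-overlapping segment ranges, exactly as
// calculateSegments does after sorting and de-duplicating.
let segments = zip(boundaryPoints, boundaryPoints.dropFirst()).map { pair in
    CMTimeRange(start: pair.0, end: pair.1)
}
// segments.count == 3; each range later gets one CustomVideoCompositionInstruction
// listing every track whose clip intersects it.
```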
/// /// - Parameter composition: Composition to build into - /// - Returns: Tuple containing video track, audio tracks, render size, frame rate, and clip instructions + /// - Returns: Tuple containing video tracks, audio tracks, render size, frame rate, and clip instructions func build(in composition: AVMutableComposition) async throws -> VideoSequenceResult { guard !videoClips.isEmpty else { throw NSError( @@ -85,32 +94,7 @@ internal class VideoSequenceBuilder { var maxFrameRate: Float = 30.0 var originalAudioTracks: [AVMutableCompositionTrack] = [] var clipInstructions: [ClipInstruction] = [] - - // Create single video track for all clips - guard - let compositionVideoTrack = composition.addMutableTrack( - withMediaType: .video, - preferredTrackID: kCMPersistentTrackID_Invalid - ) - else { - throw NSError( - domain: "VideoSequenceBuilder", - code: 2, - userInfo: [NSLocalizedDescriptionKey: "Failed to create video track"] - ) - } - - // Create single shared audio track for all clips (if enabled) - var sharedAudioTrack: AVMutableCompositionTrack? - if enableAudio { - sharedAudioTrack = composition.addMutableTrack( - withMediaType: .audio, - preferredTrackID: kCMPersistentTrackID_Invalid - ) - if sharedAudioTrack != nil { - PluginLog.print("🔊 Created SHARED audio track for all clips (will prevent empty segments)") - } - } + var trackConfigs: [CMPersistentTrackID: VideoClip] = [:] // Process each video clip for (index, clip) in videoClips.enumerated() { @@ -156,15 +140,12 @@ internal class VideoSequenceBuilder { PluginLog.print(" - Display size: \(correctedSize.width) x \(correctedSize.height)") PluginLog.print(" - Frame rate: \(nominalFrameRate) fps") - // Update max render size - if correctedSize.width > maxRenderSize.width - || correctedSize.height > maxRenderSize.height - { - let oldSize = maxRenderSize + // Update max render size (only if not explicitly provided) + if index == 0 && (renderWidth == nil || renderHeight == nil) { maxRenderSize = correctedSize - PluginLog.print( - " - ⬆️ Max render size updated: \(oldSize.width)x\(oldSize.height) → \(maxRenderSize.width)x\(maxRenderSize.height)" - ) + PluginLog.print(" - 📏 Base render size set from first clip: \(maxRenderSize.width)x\(maxRenderSize.height)") + } else if renderWidth != nil && renderHeight != nil { + maxRenderSize = CGSize(width: renderWidth!, height: renderHeight!) } // Update max frame rate @@ -176,62 +157,87 @@ internal class VideoSequenceBuilder { let clipTimeRange = await calculateTimeRange(for: clip, from: asset) let clipDuration = clipTimeRange.duration + // Determine insertion time in composition + let insertionTime: CMTime + if let segmentTimeUs = clip.segmentTimeUs { + insertionTime = CMTime(value: segmentTimeUs, timescale: 1_000_000) + } else { + insertionTime = totalDuration + } + + // Create a new track for each clip to support overlapping and independent positioning + guard let compositionVideoTrack = composition.addMutableTrack( + withMediaType: .video, + preferredTrackID: kCMPersistentTrackID_Invalid + ) else { + throw NSError( + domain: "VideoSequenceBuilder", + code: 2, + userInfo: [NSLocalizedDescriptionKey: "Failed to create video track for clip \(index)"] + ) + } + // Insert video clip into the composition track try compositionVideoTrack.insertTimeRange( clipTimeRange, of: videoTrack, - at: totalDuration + at: insertionTime ) + // Track mapping for compositor + trackConfigs[compositionVideoTrack.trackID] = clip + + // Add audio if enabled + var audioTrackID: CMPersistentTrackID? 
= nil + if enableAudio, + let audioTrack = try? await MediaInfoExtractor.loadAudioTrack(from: asset) + { + if let compositionAudioTrack = composition.addMutableTrack( + withMediaType: .audio, + preferredTrackID: kCMPersistentTrackID_Invalid + ) { + do { + try compositionAudioTrack.insertTimeRange( + clipTimeRange, + of: audioTrack, + at: insertionTime + ) + originalAudioTracks.append(compositionAudioTrack) + audioTrackID = compositionAudioTrack.trackID + PluginLog.print(" 🔊 Audio inserted into its own track (ID: \(audioTrackID!))") + } catch { + PluginLog.print(" ❌ ERROR inserting audio: \(error.localizedDescription)") + } + } else { + PluginLog.print(" ⚠️ WARNING: Failed to create audio track for clip \(index)") + } + } + // Store instruction for this clip segment clipInstructions.append( ClipInstruction( - timeRange: CMTimeRange(start: totalDuration, duration: clipDuration), + timeRange: CMTimeRange(start: insertionTime, duration: clipDuration), transform: preferredTransform, naturalSize: naturalSize, - renderSize: correctedSize + renderSize: correctedSize, + trackID: compositionVideoTrack.trackID, + audioTrackID: audioTrackID )) - // Add audio to shared track if enabled - if enableAudio, - let audioTrack = try? await MediaInfoExtractor.loadAudioTrack(from: asset), - let sharedAudioTrack = sharedAudioTrack - { - PluginLog.print("🔊 Processing audio for clip \(index)...") - PluginLog.print(" ✅ Audio track loaded from asset") - PluginLog.print(" Track ID: \(audioTrack.trackID)") - PluginLog.print( - " Duration: \(String(format: "%.2f", audioTrack.timeRange.duration.seconds))s" - ) - PluginLog.print(" Format: \(audioTrack.mediaType)") - - do { - try sharedAudioTrack.insertTimeRange( - clipTimeRange, - of: audioTrack, - at: totalDuration - ) - PluginLog.print(" ✅ Audio inserted into SHARED track!") - PluginLog.print( - " Source time range: \(String(format: "%.2f", clipTimeRange.start.seconds))s - \(String(format: "%.2f", (clipTimeRange.start + clipTimeRange.duration).seconds))s" - ) - PluginLog.print( - " Inserted at composition time: \(String(format: "%.2f", totalDuration.seconds))s" - ) - PluginLog.print( - " Audio duration: \(String(format: "%.2f", clipTimeRange.duration.seconds))s" - ) - } catch { - PluginLog.print(" ❌ ERROR inserting audio: \(error.localizedDescription)") - PluginLog.print(" Error details: \(error)") + // Update total duration (sequential part) + if clip.segmentTimeUs == nil { + totalDuration = CMTimeAdd(totalDuration, clipDuration) + } else { + let endInComposition = CMTimeAdd(insertionTime, clipDuration) + if CMTimeCompare(endInComposition, totalDuration) > 0 { + totalDuration = endInComposition } } - totalDuration = CMTimeAdd(totalDuration, clipDuration) PluginLog.print("✅ Clip \(index) added successfully") PluginLog.print(" - Duration: \(String(format: "%.2f", clipDuration.seconds))s") PluginLog.print( - " - Time range in composition: \(String(format: "%.2f", totalDuration.seconds - clipDuration.seconds))s - \(String(format: "%.2f", totalDuration.seconds))s" + " - Time range in composition: \(String(format: "%.2f", insertionTime.seconds))s - \(String(format: "%.2f", CMTimeAdd(insertionTime, clipDuration).seconds))s" ) } @@ -239,42 +245,20 @@ internal class VideoSequenceBuilder { PluginLog.print("📊 ===== VIDEO SEQUENCE SUMMARY =====") PluginLog.print(" Total clips: \(videoClips.count)") PluginLog.print(" Total duration: \(String(format: "%.2f", totalDuration.seconds))s") - PluginLog.print(" Max render size: \(maxRenderSize.width) x \(maxRenderSize.height)") + 
PluginLog.print(" Render size: \(maxRenderSize.width) x \(maxRenderSize.height)") PluginLog.print(" Max frame rate: \(maxFrameRate) fps") PluginLog.print(" Clip instructions: \(clipInstructions.count)") - - // Handle shared audio track - add to result if it has segments, otherwise remove from composition - if let audioTrack = sharedAudioTrack { - if !audioTrack.segments.isEmpty { - originalAudioTracks.append(audioTrack) - PluginLog.print( - " 🔊 AUDIO TRACKS: 1 (shared track with \(audioTrack.segments.count) segment(s))" - ) - for (segIdx, segment) in audioTrack.segments.enumerated() { - let timeMapping = segment as AVCompositionTrackSegment - PluginLog.print( - " Segment \(segIdx): \(String(format: "%.2f", timeMapping.timeMapping.target.start.seconds))s - \(String(format: "%.2f", (timeMapping.timeMapping.target.start + timeMapping.timeMapping.target.duration).seconds))s (duration: \(String(format: "%.2f", timeMapping.timeMapping.target.duration.seconds))s)" - ) - } - } else { - // Remove empty audio track from composition to prevent export errors - composition.removeTrack(audioTrack) - PluginLog.print(" 🔊 AUDIO TRACKS: 0 (shared track was empty and removed from composition)") - } - } else { - PluginLog.print(" 🔊 AUDIO TRACKS: 0 (no audio track created)") - } - + PluginLog.print(" Audio tracks: \(originalAudioTracks.count)") PluginLog.print("=====================================") PluginLog.print("") return VideoSequenceResult( - videoTrack: compositionVideoTrack, audioTracks: originalAudioTracks, totalDuration: totalDuration, renderSize: maxRenderSize, frameRate: maxFrameRate, - clipInstructions: clipInstructions + clipInstructions: clipInstructions, + trackConfigs: trackConfigs ) } @@ -312,16 +296,18 @@ internal struct ClipInstruction { let transform: CGAffineTransform let naturalSize: CGSize let renderSize: CGSize + let trackID: CMPersistentTrackID + let audioTrackID: CMPersistentTrackID? } /// Result of building a video sequence. internal struct VideoSequenceResult { - let videoTrack: AVMutableCompositionTrack let audioTracks: [AVMutableCompositionTrack] let totalDuration: CMTime let renderSize: CGSize let frameRate: Float let clipInstructions: [ClipInstruction] + let trackConfigs: [CMPersistentTrackID: VideoClip] } /// Holds the data needed to construct an AVMutableVideoComposition without @@ -356,12 +342,12 @@ internal class CustomVideoCompositionInstruction: NSObject, AVVideoCompositionIn init( timeRange: CMTimeRange, - sourceTrackID: CMPersistentTrackID, + sourceTrackIDs: [CMPersistentTrackID], layerInstructions: [AVVideoCompositionLayerInstruction], backgroundColor: CGColor? = nil ) { self.timeRange = timeRange - self._requiredSourceTrackIDs = [NSNumber(value: sourceTrackID)] + self._requiredSourceTrackIDs = sourceTrackIDs.map { NSNumber(value: $0) } self.layerInstructions = layerInstructions self.backgroundColor = backgroundColor super.init() diff --git a/macos/Classes/src/features/render/models/RenderConfig.swift b/macos/Classes/src/features/render/models/RenderConfig.swift index 36ede4a..722e981 100644 --- a/macos/Classes/src/features/render/models/RenderConfig.swift +++ b/macos/Classes/src/features/render/models/RenderConfig.swift @@ -212,6 +212,12 @@ struct RenderConfig { /// Global end time in microseconds for trimming the final composition let endUs: Int64? + /// Target render width + let renderWidth: Double? + + /// Target render height + let renderHeight: Double? + /// Whether to optimize the video for network streaming (fast start). 
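The compositor pairs these fields with intendedRenderSize to remap logical coordinates when the chosen export preset forces a different output resolution. A small numeric sketch of that mapping (values illustrative):

```swift
import CoreGraphics

let intended = CGSize(width: 3840, height: 2160)  // renderWidth / renderHeight
let actual = CGSize(width: 1920, height: 1080)    // preset-constrained buffer

let scaleFactorX = intended.width > 0 ? actual.width / intended.width : 1.0    // 0.5
let scaleFactorY = intended.height > 0 ? actual.height / intended.height : 1.0 // 0.5

// A clip positioned at (400, 200) in the logical 4K space lands at (200, 100)
// in the actual 1080p pixel buffer.
let mapped = CGPoint(x: 400 * scaleFactorX, y: 200 * scaleFactorY)
```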
/// When true, moves the moov atom to the beginning of the file. let shouldOptimizeForNetworkUse: Bool @@ -248,6 +254,8 @@ struct RenderConfig { blur: self.blur, startUs: self.startUs, endUs: self.endUs, + renderWidth: self.renderWidth, + renderHeight: self.renderHeight, shouldOptimizeForNetworkUse: self.shouldOptimizeForNetworkUse, imageBytesWithCropping: self.imageBytesWithCropping ) @@ -269,7 +277,14 @@ struct RenderConfig { inputPath: inputPath, startUs: (clipMap["startUs"] as? NSNumber)?.int64Value, endUs: (clipMap["endUs"] as? NSNumber)?.int64Value, - volume: (clipMap["volume"] as? NSNumber)?.floatValue + volume: (clipMap["volume"] as? NSNumber)?.floatValue, + opacity: (clipMap["opacity"] as? NSNumber)?.doubleValue, + x: (clipMap["x"] as? NSNumber)?.doubleValue, + y: (clipMap["y"] as? NSNumber)?.doubleValue, + width: (clipMap["width"] as? NSNumber)?.doubleValue, + height: (clipMap["height"] as? NSNumber)?.doubleValue, + segmentTimeUs: (clipMap["segmentTimeUs"] as? NSNumber)?.int64Value, + zIndex: clipMap["zIndex"] as? Int ) } } @@ -320,6 +335,8 @@ struct RenderConfig { blur: (args["blur"] as? NSNumber)?.doubleValue, startUs: (args["startUs"] as? NSNumber)?.int64Value, endUs: (args["endUs"] as? NSNumber)?.int64Value, + renderWidth: (args["renderWidth"] as? NSNumber)?.doubleValue, + renderHeight: (args["renderHeight"] as? NSNumber)?.doubleValue, shouldOptimizeForNetworkUse: args["shouldOptimizeForNetworkUse"] as? Bool ?? true, imageBytesWithCropping: args["imageBytesWithCropping"] as? Bool ?? false ) diff --git a/macos/Classes/src/features/render/models/VideoClip.swift b/macos/Classes/src/features/render/models/VideoClip.swift index 6241247..dd59540 100644 --- a/macos/Classes/src/features/render/models/VideoClip.swift +++ b/macos/Classes/src/features/render/models/VideoClip.swift @@ -1,16 +1,44 @@ import Foundation -/// Represents a video clip with optional trimming and volume control +/// Represents a video clip with optional trimming, volume control, and positioning internal struct VideoClip { let inputPath: String let startUs: Int64? let endUs: Int64? let volume: Float? + let opacity: Double? - init(inputPath: String, startUs: Int64? = nil, endUs: Int64? = nil, volume: Float? = nil) { + // New fields for composition support + let x: Double? + let y: Double? + let width: Double? + let height: Double? + let segmentTimeUs: Int64? + let zIndex: Int? + + init( + inputPath: String, + startUs: Int64? = nil, + endUs: Int64? = nil, + volume: Float? = nil, + opacity: Double? = nil, + x: Double? = nil, + y: Double? = nil, + width: Double? = nil, + height: Double? = nil, + segmentTimeUs: Int64? = nil, + zIndex: Int? = nil + ) { self.inputPath = inputPath self.startUs = startUs self.endUs = endUs self.volume = volume + self.opacity = opacity + self.x = x + self.y = y + self.width = width + self.height = height + self.segmentTimeUs = segmentTimeUs + self.zIndex = zIndex } } diff --git a/macos/Classes/src/features/render/models/VideoCompositorConfig.swift b/macos/Classes/src/features/render/models/VideoCompositorConfig.swift index 5d01bbb..6e210c9 100644 --- a/macos/Classes/src/features/render/models/VideoCompositorConfig.swift +++ b/macos/Classes/src/features/render/models/VideoCompositorConfig.swift @@ -35,4 +35,12 @@ struct VideoCompositorConfig { /// Fallback source track ID for older macOS versions where sourceTrackIDs may be empty. /// This is used when the custom compositor doesn't receive track IDs properly. 
var sourceTrackID: CMPersistentTrackID = kCMPersistentTrackID_Invalid + + /// Mapping of track ID to video clip configuration for multi-track compositing + var videoClipConfigs: [CMPersistentTrackID: VideoClip] = [:] + + /// The intended render size of the composition (logical coordinate space). + /// This is used to calculate scale factors if the actual render context size + /// differs from the intended size (e.g. due to AVAssetExportSession presets). + var intendedRenderSize: CGSize = .zero } diff --git a/macos/Classes/src/features/render/utils/VideoCompositor.swift b/macos/Classes/src/features/render/utils/VideoCompositor.swift index e6c2862..6d5530f 100644 --- a/macos/Classes/src/features/render/utils/VideoCompositor.swift +++ b/macos/Classes/src/features/render/utils/VideoCompositor.swift @@ -35,10 +35,14 @@ class VideoCompositor: NSObject, AVVideoCompositing { var cropHeight: CGFloat? var originalNaturalSize: CGSize = .zero + var intendedRenderSize: CGSize = .zero /// Fallback source track ID for older macOS versions var sourceTrackID: CMPersistentTrackID = kCMPersistentTrackID_Invalid + /// Track configurations for multi-track compositing + var videoClipConfigs: [CMPersistentTrackID: VideoClip] = [:] + /// Color filter configs for per-frame LUT computation private var colorFilterConfigs: [ColorFilterConfig] = [] @@ -75,7 +79,9 @@ class VideoCompositor: NSObject, AVVideoCompositing { self.videoRotationDegrees = config.videoRotationDegrees self.shouldApplyOrientationCorrection = config.shouldApplyOrientationCorrection self.originalNaturalSize = config.originalNaturalSize + self.intendedRenderSize = config.intendedRenderSize self.sourceTrackID = config.sourceTrackID + self.videoClipConfigs = config.videoClipConfigs self.setOverlayImageLayers(from: config.imageLayerConfigs) self.colorFilterConfigs = config.colorFilterConfigs @@ -176,313 +182,186 @@ class VideoCompositor: NSObject, AVVideoCompositing { func renderContextChanged(_ newRenderContext: AVVideoCompositionRenderContext) {} func startRequest(_ request: AVAsynchronousVideoCompositionRequest) { - // Try to get source buffer from the first available track - var sourceBuffer: CVPixelBuffer? - - if !request.sourceTrackIDs.isEmpty { - sourceBuffer = request.sourceFrame(byTrackID: request.sourceTrackIDs[0].int32Value) - } - - // Fallback 1: Try to get track ID from layer instruction if sourceTrackIDs is empty - // This can happen on older macOS versions - if sourceBuffer == nil, - let instruction = request.videoCompositionInstruction - as? 
CustomVideoCompositionInstruction, - let layerInstruction = instruction.layerInstructions.first - { - let trackID = layerInstruction.trackID - if trackID != kCMPersistentTrackID_Invalid { - sourceBuffer = request.sourceFrame(byTrackID: trackID) - } - } - - // Fallback 2: Use the pre-configured sourceTrackID from VideoCompositorConfig - // This is set during composition building and guarantees we have the correct track ID - if sourceBuffer == nil && sourceTrackID != kCMPersistentTrackID_Invalid { - sourceBuffer = request.sourceFrame(byTrackID: sourceTrackID) - } - - guard let sourceBuffer = sourceBuffer else { - request.finish( - with: NSError( - domain: "VideoCompositor", code: 0, - userInfo: [ - NSLocalizedDescriptionKey: - "No source tracks available for compositing (sourceTrackIDs: \(request.sourceTrackIDs.count), configTrackID: \(sourceTrackID))" - ])) - return - } - var outputImage = CIImage(cvPixelBuffer: sourceBuffer) - - // Apply layer instruction transform first (video scaling/centering/rotation) - // This ensures all videos are properly sized and oriented before applying user effects. - // The layerInstruction contains the preferredTransform which already handles video rotation - // from portrait to landscape or vice versa, so no additional orientation correction is needed. - // - // IMPORTANT: AVFoundation uses a top-left origin coordinate system (Y points down), - // while CIImage uses a bottom-left origin (Y points up). We need to convert the transform - // to work correctly with CIImage's coordinate system. - - // Extract layer instruction from CustomVideoCompositionInstruction - var layerInstruction: AVVideoCompositionLayerInstruction? - if let customInstruction = request.videoCompositionInstruction - as? CustomVideoCompositionInstruction, - let firstLayerInstruction = customInstruction.layerInstructions.first - { - layerInstruction = firstLayerInstruction - } - - if let layerInstruction = layerInstruction { - var startTransform = CGAffineTransform.identity - var endTransform = CGAffineTransform.identity - var timeRange = CMTimeRange.zero - - // Get the transform at the current composition time - let hasTransform = layerInstruction.getTransformRamp( - for: request.compositionTime, - start: &startTransform, - end: &endTransform, - timeRange: &timeRange - ) - - if hasTransform && !startTransform.isIdentity { - // Convert AVFoundation transform to CIImage coordinate system: - // 1. Flip Y axis before transform (go from CIImage coords to AVFoundation coords) - // 2. Apply the AVFoundation transform - // 3. 
Flip Y axis after transform (go back to CIImage coords) - let imageHeight = outputImage.extent.height - - // Flip Y: translate to top, scale Y by -1 - let flipY = CGAffineTransform(scaleX: 1, y: -1) - .translatedBy(x: 0, y: -imageHeight) - - // Convert transform: flipY * transform * flipY^-1 - // But since flipY is its own inverse (when combined with translate), we use: - // result = flipY * transform * flipY (adjusted for new height after transform) - let convertedTransform = - flipY - .concatenating(startTransform) - - outputImage = outputImage.transformed(by: convertedTransform) - - // After transform, we need to flip back and normalize - let transformedExtent = outputImage.extent - let newHeight = transformedExtent.height - let flipBack = CGAffineTransform(scaleX: 1, y: -1) - .translatedBy(x: 0, y: -newHeight) - - outputImage = outputImage.transformed(by: flipBack) - - // Normalize position to origin - let finalExtent = outputImage.extent - if finalExtent.origin.x != 0 || finalExtent.origin.y != 0 { - let translation = CGAffineTransform( - translationX: -finalExtent.origin.x, - y: -finalExtent.origin.y - ) - outputImage = outputImage.transformed(by: translation) + let renderSize = request.renderContext.size + let currentTimeUs = Int64(CMTimeGetSeconds(request.compositionTime) * 1_000_000) + + // Calculate scale factors between intended logical resolution and actual render size. + // This handles cases where AVAssetExportSession forces a different resolution + // (e.g. 1080p preset for a 4K composition). + let scaleFactorX = intendedRenderSize.width > 0 ? renderSize.width / intendedRenderSize.width : 1.0 + let scaleFactorY = intendedRenderSize.height > 0 ? renderSize.height / intendedRenderSize.height : 1.0 + + // 1. Define a common structure for all renderable items + enum RenderableItem { + case video(image: CIImage, clip: VideoClip, trackID: CMPersistentTrackID) + case imageLayer(layer: ImageLayer) + + var zIndex: Int { + switch self { + case .video(_, let clip, _): return clip.zIndex ?? 0 + case .imageLayer: return Int.max } } } - var center = CGPoint(x: outputImage.extent.midX, y: outputImage.extent.midY) - - // Apply user-defined effects (crop, rotation, flip, scale) - var transform = CGAffineTransform.identity - - // Apply LUT, blur, and flip BEFORE overlay when imageBytesWithCropping is enabled - // This ensures these effects only affect the video, not the overlay - if imageBytesWithCropping { - // Apply color filter (timed LUT) to video only - outputImage = applyColorFilter(to: outputImage, at: request.compositionTime) + var items: [RenderableItem] = [] + + // 2. Collect active video frames + for trackIDValue in request.sourceTrackIDs { + let trackID = trackIDValue.int32Value + if let sourceBuffer = request.sourceFrame(byTrackID: trackID), + let clipConfig = videoClipConfigs[trackID] { + + var frameImage = CIImage(cvPixelBuffer: sourceBuffer) + + // Apply individual track transform from layer instructions + if let customInstruction = request.videoCompositionInstruction as? 
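The scale factors above exist because `AVAssetExportSession` presets can hand the compositor a render context that differs from the composition's logical size. A worked example, assuming a 4K composition exported with a 1080p preset:

```swift
import CoreGraphics

let intended = CGSize(width: 3840, height: 2160) // logical composition size
let actual   = CGSize(width: 1920, height: 1080) // render context from the preset

let scaleFactorX = intended.width  > 0 ? actual.width  / intended.width  : 1.0 // 0.5
let scaleFactorY = intended.height > 0 ? actual.height / intended.height : 1.0 // 0.5

// A clip placed at (200, 100) with size 800x600 in logical coordinates
// lands at (100, 50) with size 400x300 in the actual render context.
let origin = CGPoint(x: 200 * scaleFactorX, y: 100 * scaleFactorY)
let size   = CGSize(width: 800 * scaleFactorX, height: 600 * scaleFactorY)
```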
CustomVideoCompositionInstruction { + for layerInstruction in customInstruction.layerInstructions { + if layerInstruction.trackID == trackID { + var startTransform = CGAffineTransform.identity + var endTransform = CGAffineTransform.identity + var timeRange = CMTimeRange.zero + + let hasTransform = layerInstruction.getTransformRamp( + for: request.compositionTime, + start: &startTransform, + end: &endTransform, + timeRange: &timeRange + ) + + if hasTransform && !startTransform.isIdentity { + let imageHeight = frameImage.extent.height + let flipY = CGAffineTransform(scaleX: 1, y: -1).translatedBy(x: 0, y: -imageHeight) + let convertedTransform = flipY.concatenating(startTransform) + frameImage = frameImage.transformed(by: convertedTransform) + + let transformedExtent = frameImage.extent + let flipBack = CGAffineTransform(scaleX: 1, y: -1).translatedBy(x: 0, y: -transformedExtent.height) + frameImage = frameImage.transformed(by: flipBack) + + // Normalize + let finalExtent = frameImage.extent + if finalExtent.origin.x != 0 || finalExtent.origin.y != 0 { + frameImage = frameImage.transformed(by: CGAffineTransform(translationX: -finalExtent.origin.x, y: -finalExtent.origin.y)) + } + } + break + } + } + } - // Apply blur to video only - if blurSigma > 0 { - outputImage = outputImage.applyingGaussianBlur(sigma: blurSigma) + items.append(.video(image: frameImage, clip: clipConfig, trackID: trackID)) } + } - // Apply flip to video only (before adding overlay) - if flipX || flipY { - let flipScaleX: CGFloat = flipX ? -1 : 1 - let flipScaleY: CGFloat = flipY ? -1 : 1 - - let flipTransform = CGAffineTransform(translationX: center.x, y: center.y) - .scaledBy(x: flipScaleX, y: flipScaleY) - .translatedBy(x: -center.x, y: -center.y) - - outputImage = outputImage.transformed(by: flipTransform) - - // Normalize position after flip - let flippedExtent = outputImage.extent - if flippedExtent.origin.x != 0 || flippedExtent.origin.y != 0 { - let translation = CGAffineTransform( - translationX: -flippedExtent.origin.x, - y: -flippedExtent.origin.y - ) - outputImage = outputImage.transformed(by: translation) - } - center = CGPoint(x: outputImage.extent.midX, y: outputImage.extent.midY) + // 3. Collect active image layers + for layer in overlayImageLayers { + let inRange = (layer.startUs == -1 || currentTimeUs >= layer.startUs) && (layer.endUs == -1 || currentTimeUs <= layer.endUs) + if inRange { + items.append(.imageLayer(layer: layer)) } } - // Apply overlay BEFORE crop if imageBytesWithCropping is enabled - if imageBytesWithCropping { - let imageRect = outputImage.extent - - // Apply time-based overlay layers - let currentTimeUs = Int64(CMTimeGetSeconds(request.compositionTime) * 1_000_000) - for layer in overlayImageLayers { - let inTimeRange = - (layer.startUs == -1 || currentTimeUs >= layer.startUs) - && (layer.endUs == -1 || currentTimeUs <= layer.endUs) - - if inTimeRange { - var img = layer.image + if items.isEmpty { + PluginLog.print("⚠️ VideoCompositor: No active items found at time \(request.compositionTime.seconds)s") + request.finish(with: NSError(domain: "VideoCompositor", code: 0, userInfo: [NSLocalizedDescriptionKey: "No active items found"])) + return + } - if let w = layer.width, let h = layer.height { - let sx = CGFloat(w) / img.extent.width - let sy = CGFloat(h) / img.extent.height - img = img.transformed(by: CGAffineTransform(scaleX: sx, y: sy)) - } + // 4. Sort all items by zIndex + let sortedItems = items.sorted { $0.zIndex < $1.zIndex } + + // 5. 
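One note on the sort just above: every image layer reports `Int.max`, and clips may share a default `zIndex` of 0, so ties are common. Swift's `sorted` is documented as stable in Swift 5 and later, which preserves track order for ties, but an explicit tie-breaker makes that intent visible. A self-contained sketch with a stand-in item type:

```swift
struct Item { let zIndex: Int; let name: String }

let items = [
    Item(zIndex: 0, name: "clip A"),
    Item(zIndex: 0, name: "clip B"),      // ties with clip A
    Item(zIndex: Int.max, name: "layer"), // image layers always on top
]

// Tie-break on the original index so equal-zIndex items keep declaration
// order regardless of the sort implementation.
let ordered = items.enumerated()
    .sorted { ($0.element.zIndex, $0.offset) < ($1.element.zIndex, $1.offset) }
    .map { $0.element }
// ordered: clip A, clip B, layer
```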
Initialize background image (black frame) + var outputImage = CIImage(color: .black).cropped(to: CGRect(origin: .zero, size: renderSize)) + + // 6. Composite each item + for item in sortedItems { + switch item { + case .video(let img, let clip, _): + var frameImg = img + + // Apply custom size if provided, otherwise scale by global factor + if let w = clip.width, let h = clip.height { + let targetW = CGFloat(w) * scaleFactorX + let targetH = CGFloat(h) * scaleFactorY + let sx = targetW / frameImg.extent.width + let sy = targetH / frameImg.extent.height + frameImg = frameImg.transformed(by: CGAffineTransform(scaleX: sx, y: sy)) + } else if scaleFactorX != 1.0 || scaleFactorY != 1.0 { + frameImg = frameImg.transformed(by: CGAffineTransform(scaleX: scaleFactorX, y: scaleFactorY)) + } - let overlay: CIImage - if layer.x == nil && layer.y == nil { - overlay = img.transformed( - by: CGAffineTransform( - scaleX: imageRect.width / img.extent.width, - y: imageRect.height / img.extent.height)) - } else { - let posX = CGFloat(layer.x ?? 0) - let posY = CGFloat(layer.y ?? 0) - let cgY = imageRect.height - posY - img.extent.height - overlay = img.transformed( - by: CGAffineTransform(translationX: posX, y: cgY)) + // Apply custom offset if provided + if clip.x != nil || clip.y != nil { + let posX = CGFloat(clip.x ?? 0) * scaleFactorX + let posY = CGFloat(clip.y ?? 0) * scaleFactorY + // Convert from top-left (Flutter) to bottom-left (Core Image) + let cgY = renderSize.height - posY - frameImg.extent.height + frameImg = frameImg.transformed(by: CGAffineTransform(translationX: posX, y: cgY)) + } else if scaleFactorX != 1.0 || scaleFactorY != 1.0 { + // Normalize position if we scaled but didn't translate manually + let extent = frameImg.extent + if extent.origin.x != 0 || extent.origin.y != 0 { + frameImg = frameImg.transformed(by: CGAffineTransform(translationX: -extent.origin.x, y: -extent.origin.y)) } - - let (opacity, animTransform) = computeAnimation( - layer: layer, - currentTimeUs: currentTimeUs, - overlayExtent: overlay.extent, - frameExtent: imageRect - ) - outputImage = compositeOverlay( - overlay, over: outputImage, opacity: opacity, transform: animTransform) } - } - } - // Cropping - if cropX != 0 || cropY != 0 || cropWidth != nil || cropHeight != nil { - let inputExtent = outputImage.extent - let videoWidth = inputExtent.width - let videoHeight = inputExtent.height - - let x = cropX - var y = cropY - let width = cropWidth ?? (videoWidth - x) - let height = cropHeight ?? 
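The `cgY` line above is the single point where Flutter's top-left coordinate convention meets Core Image's bottom-left one. A quick numeric check of the conversion:

```swift
import CoreGraphics

// A 300 pt tall clip at Flutter y = 100 in a 1080 pt high render context
// must sit 100 pt below the top edge, so its Core Image origin (measured
// from the bottom) is 1080 - 100 - 300 = 680.
let renderHeight: CGFloat = 1080
let flutterY: CGFloat = 100
let clipHeight: CGFloat = 300
let coreImageY = renderHeight - flutterY - clipHeight // 680
```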
(videoHeight - y) + // Apply opacity if needed + if let opacity = clip.opacity, opacity < 1.0 { + frameImg = frameImg.applyingFilter("CIColorMatrix", parameters: [ + "inputAVector": CIVector(x: 0, y: 0, z: 0, w: CGFloat(opacity)), + ]) + } - y = videoHeight - height - y + outputImage = frameImg.composited(over: outputImage) - let cropRect = CGRect(x: x, y: y, width: width, height: height) + case .imageLayer(let layer): + var layerImg = layer.image + if let w = layer.width, let h = layer.height { + let targetW = CGFloat(w) * scaleFactorX + let targetH = CGFloat(h) * scaleFactorY + layerImg = layerImg.transformed(by: CGAffineTransform(scaleX: targetW/layerImg.extent.width, y: targetH/layerImg.extent.height)) + } - outputImage = outputImage.cropped(to: cropRect) - outputImage = outputImage.transformed( - by: CGAffineTransform( - translationX: -cropRect.origin.x, - y: -cropRect.origin.y + let overlay: CIImage + if layer.x == nil && layer.y == nil { + overlay = layerImg.transformed(by: CGAffineTransform(scaleX: renderSize.width/layerImg.extent.width, y: renderSize.height/layerImg.extent.height)) + } else { + let posX = CGFloat(layer.x ?? 0) * scaleFactorX + let posY = CGFloat(layer.y ?? 0) * scaleFactorY + let cgY = renderSize.height - posY - layerImg.extent.height + overlay = layerImg.transformed(by: CGAffineTransform(translationX: posX, y: cgY)) + } - )) - center = CGPoint(x: outputImage.extent.midX, y: outputImage.extent.midY) + let (opacity, animTransform) = computeAnimation(layer: layer, currentTimeUs: currentTimeUs, overlayExtent: overlay.extent, frameExtent: CGRect(origin: .zero, size: renderSize)) + outputImage = compositeOverlay(overlay, over: outputImage, opacity: opacity, transform: animTransform) + } } - // Rotation - if rotateRadians != 0 { - // Rotate the image - let rotation = CGAffineTransform(rotationAngle: rotateRadians) - let rotatedImage = outputImage.transformed(by: rotation) - - // Get the new bounding box after rotation - let rotatedExtent = rotatedImage.extent - - // Translate to (0, 0) - let translation = CGAffineTransform( - translationX: -rotatedExtent.origin.x, y: -rotatedExtent.origin.y) - outputImage = rotatedImage.transformed(by: translation) - center = CGPoint(x: outputImage.extent.midX, y: outputImage.extent.midY) - } + // 7. Apply global effects (if any) + let center = CGPoint(x: outputImage.extent.midX, y: outputImage.extent.midY) + var transform = CGAffineTransform.identity - // Flipping (only if NOT imageBytesWithCropping - otherwise already applied before overlay) - if !imageBytesWithCropping && (flipX || flipY) { + // Apply flip (Global) + if flipX || flipY { let scaleX: CGFloat = flipX ? -1 : 1 let scaleY: CGFloat = flipY ? 
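A caveat on the per-clip opacity above: frames decoded from video buffers are typically premultiplied-alpha, and `CIColorMatrix` with only `inputAVector` scaled leaves the color channels at full strength, which can read as overly bright fringes during a fade. If that shows up, scaling all four vectors keeps the premultiplication consistent; a minimal sketch:

```swift
import CoreImage

// Fade a premultiplied-alpha image by scaling RGB and A together.
// `opacity` is assumed to be in 0...1.
func faded(_ image: CIImage, opacity: CGFloat) -> CIImage {
    image.applyingFilter("CIColorMatrix", parameters: [
        "inputRVector": CIVector(x: opacity, y: 0, z: 0, w: 0),
        "inputGVector": CIVector(x: 0, y: opacity, z: 0, w: 0),
        "inputBVector": CIVector(x: 0, y: 0, z: opacity, w: 0),
        "inputAVector": CIVector(x: 0, y: 0, z: 0, w: opacity),
    ])
}
```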
-1 : 1 - - let flipTransform = CGAffineTransform(translationX: center.x, y: center.y) + transform = transform.concatenating(CGAffineTransform(translationX: center.x, y: center.y) .scaledBy(x: scaleX, y: scaleY) - .translatedBy(x: -center.x, y: -center.y) - - transform = transform.concatenating(flipTransform) + .translatedBy(x: -center.x, y: -center.y)) } - // Apply Scale + // Apply Global Scale if scaleX != 1 || scaleY != 1 { transform = transform.scaledBy(x: scaleX, y: scaleY) } outputImage = outputImage.transformed(by: transform) - // Apply color filter (only if NOT imageBytesWithCropping - otherwise already applied before overlay) - if !imageBytesWithCropping { - outputImage = applyColorFilter(to: outputImage, at: request.compositionTime) - - // Apply blur - if blurSigma > 0 { - outputImage = outputImage.applyingGaussianBlur(sigma: blurSigma) - } - } - - // Apply overlay image layers (only if not already applied before crop) - if !imageBytesWithCropping { - let imageRect = outputImage.extent - - let currentTimeUs = Int64(CMTimeGetSeconds(request.compositionTime) * 1_000_000) - for layer in overlayImageLayers { - let inTimeRange = - (layer.startUs == -1 || currentTimeUs >= layer.startUs) - && (layer.endUs == -1 || currentTimeUs <= layer.endUs) - if inTimeRange { - var img = layer.image - - if let w = layer.width, let h = layer.height { - let sx = CGFloat(w) / img.extent.width - let sy = CGFloat(h) / img.extent.height - img = img.transformed(by: CGAffineTransform(scaleX: sx, y: sy)) - } - - let overlay: CIImage - if layer.x == nil && layer.y == nil { - overlay = img.transformed( - by: CGAffineTransform( - scaleX: imageRect.width / img.extent.width, - y: imageRect.height / img.extent.height)) - } else { - let posX = CGFloat(layer.x ?? 0) - let posY = CGFloat(layer.y ?? 0) - let cgY = imageRect.height - posY - img.extent.height - overlay = img.transformed( - by: CGAffineTransform(translationX: posX, y: cgY)) - } - - let (opacity, animTransform) = computeAnimation( - layer: layer, - currentTimeUs: currentTimeUs, - overlayExtent: overlay.extent, - frameExtent: imageRect - ) - outputImage = compositeOverlay( - overlay, over: outputImage, opacity: opacity, transform: animTransform) - } - } + // Apply LUT and Blur (Global) + outputImage = applyColorFilter(to: outputImage, at: request.compositionTime) + if blurSigma > 0 { + outputImage = outputImage.applyingGaussianBlur(sigma: blurSigma) } guard let outputBuffer = request.renderContext.newPixelBuffer() else {
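The global flip above is the usual translate-scale-translate conjugation, so the mirror happens about the image center rather than the origin. A standalone check that a point on the left edge lands on the right edge:

```swift
import CoreGraphics

let center = CGPoint(x: 960, y: 540) // center of a 1920x1080 frame

// Translate the center to the origin, mirror horizontally, translate back.
let flip = CGAffineTransform(translationX: center.x, y: center.y)
    .scaledBy(x: -1, y: 1)
    .translatedBy(x: -center.x, y: -center.y)

let mirrored = CGPoint(x: 0, y: 540).applying(flip) // (1920.0, 540.0)
```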