From f317814816383f86bfd647d2a6d7c31b2c9ed430 Mon Sep 17 00:00:00 2001
From: Edgar Romo Montiel <edgar.romo.montiel@intel.com>
Date: Wed, 20 May 2026 16:36:05 -0700
Subject: [PATCH 1/3] Add XPU encoding support to Encoder

Signed-off-by: Edgar Romo Montiel <edgar.romo.montiel@intel.com>
---
 ...-Add-XPU-encoding-support-to-Encoder.patch |  81 ++++++
 .../torchcodec_xpu/ColorConversionKernel.cpp  | 143 ++++++++++
 .../torchcodec_xpu/ColorConversionKernel.h    |  18 ++
 .../src/torchcodec_xpu/XpuDeviceInterface.cpp | 258 ++++++++++++++++++
 .../src/torchcodec_xpu/XpuDeviceInterface.h   |  23 ++
 5 files changed, 523 insertions(+)
 create mode 100644 packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch

diff --git a/packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch b/packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch
new file mode 100644
index 0000000..65be795
--- /dev/null
+++ b/packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch
@@ -0,0 +1,81 @@
+diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
+index 2fe136fb..d024834b 100644
+--- a/src/torchcodec/_core/Encoder.cpp
++++ b/src/torchcodec/_core/Encoder.cpp
+@@ -787,7 +787,8 @@ void VideoEncoder::initializeEncoder(
+   // The default CUDA interface is decode-only; encoders need the FFmpeg-based
+   // one.
+   deviceInterface_ = createDeviceInterface(
+-      stableDevice, stableDevice.type() == kStableCUDA ? "ffmpeg" : "default");
++      stableDevice,
++      stableDevice.type() == kStableCUDA ? "ffmpeg" : "default");
+   const AVCodec* avCodec = nullptr;
+   // If codec arg is provided, find codec using logic similar to FFmpeg:
+   // https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835
+@@ -846,7 +847,8 @@ void VideoEncoder::initializeEncoder(
+   if (videoStreamOptions.pixelFormat.has_value()) {
+     // TODO-VideoEncoder: (P2) Enable pixel formats to be set by user on GPU
+     // and handled with the appropriate NPP function on GPU.
+-    if (frames_.device().type() == kStableCUDA) {
++    if (frames_.device().type() == kStableCUDA ||
++        frames_.device().type() == kStableXPU) {
+       STD_TORCH_CHECK(
+           false,
+           "Video encoding on GPU currently only supports the nv12 pixel format. "
+@@ -855,7 +857,8 @@ void VideoEncoder::initializeEncoder(
+     outPixelFormat =
+         validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value());
+   } else {
+-    if (frames_.device().type() == kStableCUDA) {
++    if (frames_.device().type() == kStableCUDA ||
++        frames_.device().type() == kStableXPU) {
+       // Default to nv12 pixel format when encoding on GPU.
+       outPixelFormat = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT;
+     } else {
+@@ -910,7 +913,8 @@ void VideoEncoder::initializeEncoder(
+         0);
+   }
+ 
+-  if (frames_.device().type() == kStableCUDA) {
++  if (frames_.device().type() == kStableCUDA ||
++      frames_.device().type() == kStableXPU) {
+     deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get());
+     deviceInterface_->setupHardwareFrameContextForEncoding(
+         avCodecContext_.get());
+@@ -1124,7 +1128,8 @@ void MultiStreamEncoder::addVideoStream(
+   // The default CUDA interface is decode-only; encoders need the FFmpeg-based
+   // one.
+   videoStream_->deviceInterface = createDeviceInterface(
+-      stableDevice, stableDevice.type() == kStableCUDA ? "ffmpeg" : "default");
++      stableDevice,
++      stableDevice.type() == kStableCUDA ? "ffmpeg" : "default");
+   videoStream_->inHeight = height;
+   videoStream_->inWidth = width;
+   videoStream_->inFrameRate = frameRate;
+@@ -1208,7 +1213,7 @@ void MultiStreamEncoder::initializeVideoStream() {
+   if (videoStream.options.pixelFormat.has_value()) {
+     // TODO-MultiStreamEncoder: (P2) Enable pixel formats to be set by user on
+     // GPU and handled with the appropriate NPP function on GPU.
+-    if (deviceType == kStableCUDA) {
++    if (deviceType == kStableCUDA || deviceType == kStableXPU) {
+       STD_TORCH_CHECK(
+           false,
+           "Video encoding on GPU currently only supports the nv12 pixel format. "
+@@ -1217,7 +1222,7 @@ void MultiStreamEncoder::initializeVideoStream() {
+     outPixelFormat =
+         validatePixelFormat(*avCodec, videoStream.options.pixelFormat.value());
+   } else {
+-    if (deviceType == kStableCUDA) {
++    if (deviceType == kStableCUDA || deviceType == kStableXPU) {
+       // Default to nv12 pixel format when encoding on GPU.
+       outPixelFormat = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT;
+     } else {
+@@ -1275,7 +1280,7 @@ void MultiStreamEncoder::initializeVideoStream() {
+         0);
+   }
+ 
+-  if (deviceType == kStableCUDA) {
++  if (deviceType == kStableCUDA || deviceType == kStableXPU) {
+     videoStream.deviceInterface->registerHardwareDeviceWithCodec(
+         videoStream.avCodecContext.get());
+     videoStream.deviceInterface->setupHardwareFrameContextForEncoding(
diff --git a/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp b/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp
index 01d25e2..eea9efa 100644
--- a/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp
+++ b/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp
@@ -9,6 +9,9 @@ namespace facebook::torchcodec {
 
 using float3x3 = std::array<sycl::float3, 3>;
 
+// ============================================================
+// Decoding matrices: YCbCr -> RGB  (used by NV12toRGBKernel)
+// ============================================================
 struct rgb_matrix {
   static constexpr float3x3 BT709 = {
     sycl::float3{ 1.0, 0.0, 1.5748 },
@@ -23,6 +26,26 @@ struct rgb_matrix {
   };
 };
 
+// ============================================================
+// Encoding matrices: RGB -> YCbCr  (used by RGB24toNV12Kernel)
+// Inverse of rgb_matrix above.
+// Row 0: Y  coefficients
+// Row 1: Cb coefficients
+// Row 2: Cr coefficients
+// ============================================================
+struct yuv_matrix {
+  static constexpr float3x3 BT709 = {
+    sycl::float3{  0.2126f,    0.7152f,    0.0722f  },  // Y
+    sycl::float3{ -0.1146f,   -0.3854f,    0.5f     },  // Cb
+    sycl::float3{  0.5f,      -0.4542f,   -0.0458f  }   // Cr
+  };
+  static constexpr float3x3 BT601 = {
+    sycl::float3{  0.299f,     0.587f,     0.114f   },  // Y
+    sycl::float3{ -0.168736f, -0.331264f,  0.5f     },  // Cb
+    sycl::float3{  0.5f,      -0.418688f, -0.081312f}   // Cr
+  };
+};
+
 // Helper function for the Intel Tile-Y offset calculation
 // Intel Y-Tiling uses COLUMN-MAJOR OWord (16 bytes) organization
 // Tile: 128 bytes wide × 32 rows = 4KB
@@ -166,6 +189,13 @@ const float3x3 getColorConversionMatrix(enum AVColorSpace colorspace) {
   return rgb_matrix::BT601;
 }
 
+const float3x3 getYUVConversionMatrix(enum AVColorSpace colorspace) {
+  if (colorspace == AVCOL_SPC_BT709) {
+    return yuv_matrix::BT709;
+  }
+  return yuv_matrix::BT601;
+}
+
 void convertNV12ToRGB(
     sycl::queue& queue,
     const uint8_t* y_plane,
@@ -201,5 +231,118 @@ void registerColorConversionKernel() {
   (void)s;
 }
 
+// ============================================================
+// Encoding kernel: NCHW RGB tensor -> NV12 VAAPI surface
+// ============================================================
+struct RGB24toNV12Kernel {
+  const uint8_t* rgb_nchw;  // CHW uint8 device pointer (R, G, B planes)
+  int64_t ch_stride;        // stride between channel planes
+  int64_t row_stride;       // stride between rows within a plane
+  int64_t pixel_stride;     // stride between adjacent pixels (1 for NCHW, 3 for HWC-permuted)
+  uint8_t* y_plane;
+  uint8_t* uv_plane;
+  int width;
+  int height;
+  int y_pitch;              // surface Y-plane row pitch in bytes
+  int uv_pitch;             // surface UV-plane row pitch in bytes
+  bool is_tiled;            // true → Tile-Y; false → linear
+  bool fullrange;
+  float3x3 yuv_mat;
+
+  RGB24toNV12Kernel(
+      const uint8_t* rgb_nchw_,
+      int64_t ch_stride_,
+      int64_t row_stride_,
+      int64_t pixel_stride_,
+      uint8_t* y_plane_,
+      uint8_t* uv_plane_,
+      int width_,
+      int height_,
+      int y_pitch_,
+      int uv_pitch_,
+      bool is_tiled_,
+      bool fullrange_,
+      const float3x3& yuv_mat_)
+    : rgb_nchw(rgb_nchw_),
+      ch_stride(ch_stride_),
+      row_stride(row_stride_),
+      pixel_stride(pixel_stride_),
+      y_plane(y_plane_),
+      uv_plane(uv_plane_),
+      width(width_),
+      height(height_),
+      y_pitch(y_pitch_),
+      uv_pitch(uv_pitch_),
+      is_tiled(is_tiled_),
+      fullrange(fullrange_),
+      yuv_mat(yuv_mat_)
+  {}
+
+  void operator()(sycl::id<2> idx) const {
+    int x = idx[1];
+    int y = idx[0];
+
+    if (x >= width || y >= height) {
+      return;
+    }
+
+    // Read RGB from NCHW tensor.
+    float r = rgb_nchw[0 * ch_stride + y * row_stride + x * pixel_stride] / 255.0f;
+    float g = rgb_nchw[1 * ch_stride + y * row_stride + x * pixel_stride] / 255.0f;
+    float b = rgb_nchw[2 * ch_stride + y * row_stride + x * pixel_stride] / 255.0f;
+    sycl::float3 src{r, g, b};
+
+    // Luma Y — write to Tile-Y or linear destination
+    float Y_norm = sycl::dot(src, yuv_mat[0]);
+    float Y = fullrange ? Y_norm * 255.0f : 16.0f + Y_norm * 219.0f;
+    size_t y_dst = is_tiled ? get_tile_offset(x, y, y_pitch)
+                            : (size_t)y * y_pitch + x;
+    y_plane[y_dst] = (uint8_t)std::clamp(Y, 0.0f, 255.0f);
+
+    // Chroma UV: one pair per 2x2 block (NV12 4:2:0 subsampling).
+    if ((x % 2 == 0) && (y % 2 == 0)) {
+      float Cb_norm = sycl::dot(src, yuv_mat[1]);
+      float Cr_norm = sycl::dot(src, yuv_mat[2]);
+      float U = fullrange ? Cb_norm * 255.0f + 128.0f : 128.0f + Cb_norm * 224.0f;
+      float V = fullrange ? Cr_norm * 255.0f + 128.0f : 128.0f + Cr_norm * 224.0f;
+      size_t u_dst = is_tiled ? get_tile_offset(x,     y / 2, uv_pitch)
+                              : (size_t)(y / 2) * uv_pitch + x;
+      size_t v_dst = is_tiled ? get_tile_offset(x + 1, y / 2, uv_pitch)
+                              : (size_t)(y / 2) * uv_pitch + x + 1;
+      uv_plane[u_dst] = (uint8_t)std::clamp(U, 0.0f, 255.0f);
+      uv_plane[v_dst] = (uint8_t)std::clamp(V, 0.0f, 255.0f);
+    }
+  }
+};
+
+void convertRGBToNV12(
+    sycl::queue& queue,
+    const uint8_t* rgb_nchw,
+    int64_t ch_stride,
+    int64_t row_stride,
+    int64_t pixel_stride,
+    uint8_t* dst_y,
+    uint8_t* dst_uv,
+    int width,
+    int height,
+    int y_pitch,
+    int uv_pitch,
+    bool is_tiled,
+    enum AVColorRange color_range,
+    enum AVColorSpace colorspace) {
+  bool fullrange = (color_range == AVCOL_RANGE_JPEG);
+  queue.submit([&](sycl::handler& cgh) {
+    RGB24toNV12Kernel kernel(
+        rgb_nchw, ch_stride, row_stride, pixel_stride,
+        dst_y, dst_uv,
+        width, height,
+        y_pitch, uv_pitch,
+        is_tiled,
+        fullrange, getYUVConversionMatrix(colorspace));
+    cgh.parallel_for(sycl::range<2>(height, width), kernel);
+  });
+  queue.wait();
+}
+
 } // namespace facebook::torchcodec
 #endif // WITH_SYCL_KERNELS
diff --git a/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.h b/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.h
index e063397..e4f3c40 100644
--- a/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.h
+++ b/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.h
@@ -24,6 +24,24 @@ void convertNV12ToRGB(
     enum AVColorRange color_range,
     enum AVColorSpace colorspace);
 
+// Encoding: NCHW uint8 RGB tensor (on XPU) -> NV12 VAAPI surface.
+// is_tiled: true for Intel Tile-Y surfaces (drm_format_modifier != 0), false for linear.
+void convertRGBToNV12(
+    sycl::queue& queue,
+    const uint8_t* rgb_nchw,
+    int64_t ch_stride,
+    int64_t row_stride,
+    int64_t pixel_stride,
+    uint8_t* dst_y,
+    uint8_t* dst_uv,
+    int width,
+    int height,
+    int y_pitch,
+    int uv_pitch,
+    bool is_tiled,
+    enum AVColorRange color_range,
+    enum AVColorSpace colorspace);
+
 // Anchor function to force kernel registration
 void registerColorConversionKernel();
 
diff --git a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp
index e017bd5..69318cb 100644
--- a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp
+++ b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp
@@ -5,6 +5,7 @@
 #include <stdlib.h>
 #include <string>
 #include <unordered_map>
+#include <iostream>
 
 #include <level_zero/ze_api.h>
 #include <va/va_drmcommon.h>
@@ -569,4 +570,261 @@ std::optional<const AVCodec*> XpuDeviceInterface::findCodec(
   return std::nullopt;
 }
 
+// ============================================================
+// Encoding: setupHardwareFrameContextForEncoding
+// ============================================================
+// Allocates and initializes a VAAPI hw_frames_ctx on the codec context.
+// Mirrors the CUDA implementation in CudaDeviceInterface.cpp, with
+// AV_PIX_FMT_CUDA -> AV_PIX_FMT_VAAPI as the only pixel format change.
+void XpuDeviceInterface::setupHardwareFrameContextForEncoding(
+    AVCodecContext* codecContext) {
+  TORCH_CHECK(
+      ctx_,
+      "VAAPI hw device context is not initialized. "
+      "This device may not have a media engine (e.g. PVC/Ponte Vecchio). "
+      "Encoding via XPU is only supported on devices with VAAPI.");
+  TORCH_CHECK(codecContext != nullptr, "codecContext is null");
+
+  AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(ctx_.get());
+  TORCH_CHECK(
+      hwFramesCtxRef != nullptr,
+      "Failed to allocate VAAPI hw frames context for codec");
+
+  // sw_pix_fmt: the software (CPU-accessible) format the encoder consumes inside the surface
+  // pix_fmt:    the hardware wrapper format the codec sees (must match hw_frames_ctx->format)
+  codecContext->sw_pix_fmt = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT; // AV_PIX_FMT_NV12
+  codecContext->pix_fmt    = AV_PIX_FMT_VAAPI;
+
+  auto* hwFramesCtx = reinterpret_cast<AVHWFramesContext*>(hwFramesCtxRef->data);
+  hwFramesCtx->format    = AV_PIX_FMT_VAAPI;
+  hwFramesCtx->sw_format = AV_PIX_FMT_NV12;
+  hwFramesCtx->width     = codecContext->width;
+  hwFramesCtx->height    = codecContext->height;
+
+  int ret = av_hwframe_ctx_init(hwFramesCtxRef);
+  if (ret < 0) {
+    av_buffer_unref(&hwFramesCtxRef);
+    TORCH_CHECK(
+        false,
+        "Failed to initialize VAAPI hw frames context: ",
+        getFFMPEGErrorStringFromErrorCode(ret));
+  }
+  codecContext->hw_frames_ctx = hwFramesCtxRef;
+}
+
+// ============================================================
+// Encoding: convertTensorToAVFrameForEncoding
+// ============================================================
+UniqueAVFrame XpuDeviceInterface::convertTensorToAVFrameForEncoding(
+    const torch::stable::Tensor& tensor,
+    int frameIndex,
+    AVCodecContext* codecContext) {
+  TORCH_CHECK(
+      tensor.dim() == 3 && tensor.sizes()[0] == 3,
+      "Expected CHW tensor with 3 channels (RGB), got shape: ",
+      tensor.sizes()[0], "x", tensor.sizes()[1], "x", tensor.sizes()[2]);
+  TORCH_CHECK(codecContext != nullptr, "codecContext is null");
+  TORCH_CHECK(
+      codecContext->hw_frames_ctx != nullptr,
+      "hw_frames_ctx is null: call setupHardwareFrameContextForEncoding first");
+
+  UniqueAVFrame vaFrame(av_frame_alloc());
+  TORCH_CHECK(vaFrame != nullptr, "Failed to allocate AVFrame for encoding");
+  vaFrame->format = AV_PIX_FMT_VAAPI;
+  vaFrame->height = static_cast<int>(tensor.sizes()[1]);
+  vaFrame->width  = static_cast<int>(tensor.sizes()[2]);
+  vaFrame->pts    = frameIndex;
+
+  // Allocate a VAAPI surface from the hw_frames_ctx pool created in
+  // setupHardwareFrameContextForEncoding.
+  int ret = av_hwframe_get_buffer(codecContext->hw_frames_ctx, vaFrame.get(), 0);
+  TORCH_CHECK(
+      ret >= 0,
+      "av_hwframe_get_buffer failed: ",
+      getFFMPEGErrorStringFromErrorCode(ret));
+
+#ifdef WITH_SYCL_KERNELS
+  if (xpu::use_sycl_color_conversion_kernel()) {
+    VLOG(9) << "[XPU Encoder] Encoding frame " << frameIndex
+            << " via SYCL on device=xpu:" << device_.index();
+    return encodeConvert_SYCL(tensor, codecContext, std::move(vaFrame));
+  }
+#endif
+  VLOG(9) << "[XPU Encoder] Encoding frame " << frameIndex << " via CPU fallback";
+  return encodeConvert_CPU(tensor, codecContext, std::move(vaFrame));
+}
+
+// ============================================================
+// Encoding: encodeConvert_SYCL
+// ============================================================
+UniqueAVFrame XpuDeviceInterface::encodeConvert_SYCL(
+    const torch::stable::Tensor& tensor,
+    AVCodecContext* codecContext,
+    UniqueAVFrame vaFrame) {
+#ifdef WITH_SYCL_KERNELS
+  VADisplay display = getVaDisplayFromAV(vaFrame.get());
+  VASurfaceID surfaceId = (VASurfaceID)(uintptr_t)vaFrame->data[3];
+
+  VADRMPRIMESurfaceDescriptor desc{};
+  VAStatus sts = vaExportSurfaceHandle(
+      display,
+      surfaceId,
+      VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2,
+      VA_EXPORT_SURFACE_WRITE_ONLY,  // write for encoding (vs. READ_ONLY for decoding)
+      &desc);
+  TORCH_CHECK(
+      sts == VA_STATUS_SUCCESS,
+      "vaExportSurfaceHandle (WRITE_ONLY) failed: ",
+      vaErrorStr(sts));
+  TORCH_CHECK(desc.num_objects == 1, "Expected 1 DMA-BUF object, got ", desc.num_objects);
+  // NV12 surfaces can be exported in two valid layouts depending on the driver:
+  //  Layout A: 1 layer,  2 planes  — layers[0].planes[0]=Y, layers[0].planes[1]=UV
+  //  Layout B: 2 layers, 1 plane each — layers[0].planes[0]=Y, layers[1].planes[0]=UV
+  const bool layoutA = (desc.num_layers == 1 && desc.layers[0].num_planes == 2);
+  const bool layoutB = (desc.num_layers == 2 && desc.layers[0].num_planes == 1
+                        && desc.layers[1].num_planes == 1);
+  TORCH_CHECK(
+      layoutA || layoutB,
+      "Unsupported NV12 export layout: num_layers=", desc.num_layers,
+      " layers[0].num_planes=", desc.layers[0].num_planes);
+  // Get Level Zero context and device handles via SYCL interop.
+  sycl::queue queue = c10::xpu::getCurrentXPUStream(device_.index());
+  ze_context_handle_t zeCtx  = nullptr;
+  ze_device_handle_t  zeDevice = nullptr;
+  queue
+      .submit([&](sycl::handler& cgh) {
+        cgh.host_task([&](const sycl::interop_handle& ih) {
+          zeCtx    = ih.get_native_context<sycl::backend::ext_oneapi_level_zero>();
+          zeDevice = ih.get_native_device<sycl::backend::ext_oneapi_level_zero>();
+        });
+      })
+      .wait();
+
+  ze_external_memory_import_fd_t import_fd_desc{};
+  import_fd_desc.stype = ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_FD;
+  import_fd_desc.flags = ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF;
+  import_fd_desc.fd    = desc.objects[0].fd;
+
+  ze_device_mem_alloc_desc_t alloc_desc{};
+  alloc_desc.pNext = &import_fd_desc;
+  void* usm_ptr = nullptr;
+  ze_result_t res = zeMemAllocDevice(
+      zeCtx, &alloc_desc, desc.objects[0].size, 0, zeDevice, &usm_ptr);
+  TORCH_CHECK(
+      res == ZE_RESULT_SUCCESS,
+      "zeMemAllocDevice failed importing encode surface fd=",
+      desc.objects[0].fd);
+
+  // Extract Y and UV plane pointers and pitches for both layouts
+  uint8_t* y_ptr;
+  uint8_t* uv_ptr;
+  int y_pitch, uv_pitch;
+  if (layoutA) {
+    y_ptr    = static_cast<uint8_t*>(usm_ptr) + desc.layers[0].offset[0];
+    uv_ptr   = static_cast<uint8_t*>(usm_ptr) + desc.layers[0].offset[1];
+    y_pitch  = static_cast<int>(desc.layers[0].pitch[0]);
+    uv_pitch = static_cast<int>(desc.layers[0].pitch[1]);
+  } else {
+    y_ptr    = static_cast<uint8_t*>(usm_ptr) + desc.layers[0].offset[0];
+    uv_ptr   = static_cast<uint8_t*>(usm_ptr) + desc.layers[1].offset[0];
+    y_pitch  = static_cast<int>(desc.layers[0].pitch[0]);
+    uv_pitch = static_cast<int>(desc.layers[1].pitch[0]);
+  }
+
+  // drm_format_modifier != 0 means tiled (e.g. Intel Tile-Y on BMG/Gen12+).
+  const bool is_tiled = (desc.objects[0].drm_format_modifier != 0);
+  convertRGBToNV12(
+      queue,
+      static_cast<const uint8_t*>(tensor.data_ptr()),
+      tensor.strides()[0],   // ch_stride
+      tensor.strides()[1],   // row_stride
+      tensor.strides()[2],   // pixel_stride
+      y_ptr,
+      uv_ptr,
+      vaFrame->width,
+      vaFrame->height,
+      y_pitch,
+      uv_pitch,
+      is_tiled,
+      codecContext->color_range,
+      codecContext->colorspace);
+
+  zeMemFree(zeCtx, usm_ptr);
+  close(desc.objects[0].fd);
+
+  vaFrame->colorspace  = codecContext->colorspace;
+  vaFrame->color_range = codecContext->color_range;
+  return vaFrame;
+#else
+  return encodeConvert_CPU(tensor, codecContext, std::move(vaFrame));
+#endif
+}
+
+// ============================================================
+// Encoding: encodeConvert_CPU  (CPU fallback)
+// ============================================================
+UniqueAVFrame XpuDeviceInterface::encodeConvert_CPU(
+    const torch::stable::Tensor& tensor,
+    AVCodecContext* codecContext,
+    UniqueAVFrame vaFrame) {
+  // Move XPU tensor to CPU (blocking)
+  torch::stable::Tensor cpuTensor =
+      torch::stable::to(tensor, StableDevice(kStableCPU, 0));
+
+  const uint8_t* data = static_cast<const uint8_t*>(cpuTensor.data_ptr());
+  // strides() are in elements (uint8), so they equal byte strides here.
+  int64_t ch_stride  = cpuTensor.strides()[0];
+  int64_t row_stride = cpuTensor.strides()[1];
+
+  // Allocate an intermediate CPU NV12 frame for sws_scale output
+  UniqueAVFrame cpuFrame(av_frame_alloc());
+  TORCH_CHECK(cpuFrame != nullptr, "Failed to allocate CPU NV12 AVFrame");
+  cpuFrame->format = AV_PIX_FMT_NV12;
+  cpuFrame->width  = vaFrame->width;
+  cpuFrame->height = vaFrame->height;
+  int ret = av_frame_get_buffer(cpuFrame.get(), 0);
+  TORCH_CHECK(ret >= 0, "av_frame_get_buffer (NV12) failed: ",
+              getFFMPEGErrorStringFromErrorCode(ret));
+
+  // Zero-copy GBRP view of the NCHW tensor (GBRP plane order: G=ch1, B=ch2, R=ch0).
+  UniqueAVFrame gbrpFrame(av_frame_alloc());
+  TORCH_CHECK(gbrpFrame != nullptr, "Failed to allocate GBRP AVFrame");
+  gbrpFrame->format = AV_PIX_FMT_GBRP;
+  gbrpFrame->width  = vaFrame->width;
+  gbrpFrame->height = vaFrame->height;
+  gbrpFrame->data[0] = const_cast<uint8_t*>(data + 1 * ch_stride);  // G
+  gbrpFrame->data[1] = const_cast<uint8_t*>(data + 2 * ch_stride);  // B
+  gbrpFrame->data[2] = const_cast<uint8_t*>(data + 0 * ch_stride);  // R
+  gbrpFrame->linesize[0] = static_cast<int>(row_stride);
+  gbrpFrame->linesize[1] = static_cast<int>(row_stride);
+  gbrpFrame->linesize[2] = static_cast<int>(row_stride);
+
+  // GBRP -> NV12 via libswscale
+  SwsContext* swsCtx = sws_getContext(
+      vaFrame->width, vaFrame->height, AV_PIX_FMT_GBRP,
+      vaFrame->width, vaFrame->height, AV_PIX_FMT_NV12,
+      SWS_BILINEAR, nullptr, nullptr, nullptr);
+  TORCH_CHECK(swsCtx != nullptr, "sws_getContext(GBRP->NV12) failed");
+  sws_scale(
+      swsCtx,
+      gbrpFrame->data,
+      gbrpFrame->linesize,
+      0,
+      vaFrame->height,
+      cpuFrame->data,
+      cpuFrame->linesize);
+  sws_freeContext(swsCtx);
+
+  // Upload CPU NV12 -> VAAPI surface
+  ret = av_hwframe_transfer_data(vaFrame.get(), cpuFrame.get(), 0);
+  TORCH_CHECK(
+      ret >= 0,
+      "av_hwframe_transfer_data (NV12->VAAPI) failed: ",
+      getFFMPEGErrorStringFromErrorCode(ret));
+
+  vaFrame->colorspace  = codecContext->colorspace;
+  vaFrame->color_range = codecContext->color_range;
+  return vaFrame;
+}
+
 } // namespace facebook::torchcodec
diff --git a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.h b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.h
index 71b7566..682a312 100644
--- a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.h
+++ b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.h
@@ -32,6 +32,15 @@ class XpuDeviceInterface : public DeviceInterface {
 
   void registerHardwareDeviceWithCodec(AVCodecContext* codecContext) override;
 
+  // ---- Encoding overrides ----
+  UniqueAVFrame convertTensorToAVFrameForEncoding(
+      const torch::stable::Tensor& tensor,
+      int frameIndex,
+      AVCodecContext* codecContext) override;
+
+  void setupHardwareFrameContextForEncoding(
+      AVCodecContext* codecContext) override;
+
   void convertAVFrameToFrameOutput(
       UniqueAVFrame& avFrame,
       FrameOutput& frameOutput,
@@ -64,6 +73,20 @@ class XpuDeviceInterface : public DeviceInterface {
   void convertAVFrameToFrameOutput_FilterGraph(
       UniqueAVFrame& avFrame,
       torch::stable::Tensor& dst);
+
+  // ---- Encoding helpers ----
+  // SYCL path: exports VAAPI surface as DMA-BUF, imports via Level Zero USM,
+  // runs convertRGBToNV12 kernel directly on the surface memory.
+  UniqueAVFrame encodeConvert_SYCL(
+      const torch::stable::Tensor& tensor,
+      AVCodecContext* codecContext,
+      UniqueAVFrame vaFrame);
+  // CPU fallback: moves tensor to CPU, uses libswscale GBRP->NV12,
+  // then av_hwframe_transfer_data to upload into the VAAPI surface.
+  UniqueAVFrame encodeConvert_CPU(
+      const torch::stable::Tensor& tensor,
+      AVCodecContext* codecContext,
+      UniqueAVFrame vaFrame);
 };
 
 } // namespace facebook::torchcodec

From ad842da81745a42591441e5fd7ab58cb8d30a321 Mon Sep 17 00:00:00 2001
From: Edgar Romo Montiel <edgar.romo.montiel@intel.com>
Date: Mon, 25 May 2026 14:48:37 -0700
Subject: [PATCH 2/3] Update patch and constant for context sw_pixel conversion
 in XPU case.

Signed-off-by: Edgar Romo Montiel <edgar.romo.montiel@intel.com>
---
 ...-Add-XPU-encoding-support-to-Encoder.patch | 32 ++++---------------
 .../src/torchcodec_xpu/XpuDeviceInterface.cpp |  8 ++---
 2 files changed, 9 insertions(+), 31 deletions(-)

diff --git a/packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch b/packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch
index 65be795..d11c40c 100644
--- a/packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch
+++ b/packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch
@@ -2,17 +2,7 @@ diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
 index 2fe136fb..d024834b 100644
 --- a/src/torchcodec/_core/Encoder.cpp
 +++ b/src/torchcodec/_core/Encoder.cpp
-@@ -787,7 +787,8 @@ void VideoEncoder::initializeEncoder(
-   // The default CUDA interface is decode-only; encoders need the FFmpeg-based
-   // one.
-   deviceInterface_ = createDeviceInterface(
--      stableDevice, stableDevice.type() == kStableCUDA ? "ffmpeg" : "default");
-+      stableDevice,
-+      stableDevice.type() == kStableCUDA ? "ffmpeg" : "default");
-   const AVCodec* avCodec = nullptr;
-   // If codec arg is provided, find codec using logic similar to FFmpeg:
-   // https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835
-@@ -846,7 +847,8 @@ void VideoEncoder::initializeEncoder(
+@@ -846,7 +846,8 @@ void VideoEncoder::initializeEncoder(
    if (videoStreamOptions.pixelFormat.has_value()) {
      // TODO-VideoEncoder: (P2) Enable pixel formats to be set by user on GPU
      // and handled with the appropriate NPP function on GPU.
@@ -22,7 +12,7 @@ index 2fe136fb..d024834b 100644
        STD_TORCH_CHECK(
            false,
            "Video encoding on GPU currently only supports the nv12 pixel format. "
-@@ -855,7 +857,8 @@ void VideoEncoder::initializeEncoder(
+@@ -855,7 +856,8 @@ void VideoEncoder::initializeEncoder(
      outPixelFormat =
          validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value());
    } else {
@@ -32,7 +22,7 @@ index 2fe136fb..d024834b 100644
        // Default to nv12 pixel format when encoding on GPU.
        outPixelFormat = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT;
      } else {
-@@ -910,7 +913,8 @@ void VideoEncoder::initializeEncoder(
+@@ -910,7 +912,8 @@ void VideoEncoder::initializeEncoder(
          0);
    }
  
@@ -42,17 +32,7 @@ index 2fe136fb..d024834b 100644
      deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get());
      deviceInterface_->setupHardwareFrameContextForEncoding(
          avCodecContext_.get());
-@@ -1124,7 +1128,8 @@ void MultiStreamEncoder::addVideoStream(
-   // The default CUDA interface is decode-only; encoders need the FFmpeg-based
-   // one.
-   videoStream_->deviceInterface = createDeviceInterface(
--      stableDevice, stableDevice.type() == kStableCUDA ? "ffmpeg" : "default");
-+      stableDevice,
-+      stableDevice.type() == kStableCUDA ? "ffmpeg" : "default");
-   videoStream_->inHeight = height;
-   videoStream_->inWidth = width;
-   videoStream_->inFrameRate = frameRate;
-@@ -1208,7 +1213,7 @@ void MultiStreamEncoder::initializeVideoStream() {
+@@ -1208,7 +1211,7 @@ void MultiStreamEncoder::initializeVideoStream() {
    if (videoStream.options.pixelFormat.has_value()) {
      // TODO-MultiStreamEncoder: (P2) Enable pixel formats to be set by user on
      // GPU and handled with the appropriate NPP function on GPU.
@@ -61,7 +41,7 @@ index 2fe136fb..d024834b 100644
        STD_TORCH_CHECK(
            false,
            "Video encoding on GPU currently only supports the nv12 pixel format. "
-@@ -1217,7 +1222,7 @@ void MultiStreamEncoder::initializeVideoStream() {
+@@ -1217,7 +1220,7 @@ void MultiStreamEncoder::initializeVideoStream() {
      outPixelFormat =
          validatePixelFormat(*avCodec, videoStream.options.pixelFormat.value());
    } else {
@@ -70,7 +50,7 @@ index 2fe136fb..d024834b 100644
        // Default to nv12 pixel format when encoding on GPU.
        outPixelFormat = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT;
      } else {
-@@ -1275,7 +1280,7 @@ void MultiStreamEncoder::initializeVideoStream() {
+@@ -1275,7 +1278,7 @@ void MultiStreamEncoder::initializeVideoStream() {
          0);
    }
  
diff --git a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp
index 69318cb..09b4b6f 100644
--- a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp
+++ b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp
@@ -5,7 +5,6 @@
 #include <stdlib.h>
 #include <string>
 #include <unordered_map>
-#include <iostream>
 
 #include <level_zero/ze_api.h>
 #include <va/va_drmcommon.h>
@@ -573,9 +572,8 @@ std::optional<const AVCodec*> XpuDeviceInterface::findCodec(
 // ============================================================
 // Encoding: setupHardwareFrameContextForEncoding
 // ============================================================
-// Allocates and initializes a VAAPI hw_frames_ctx on the codec context.
-// Mirrors the CUDA implementation in CudaDeviceInterface.cpp, with
-// AV_PIX_FMT_CUDA -> AV_PIX_FMT_VAAPI as the only pixel format change.
+// Allocates a VAAPI hw_frames_ctx on the codec context so the encoder
+// can write directly into VAAPI surfaces (NV12 layout, VAAPI wrapper).
 void XpuDeviceInterface::setupHardwareFrameContextForEncoding(
     AVCodecContext* codecContext) {
   TORCH_CHECK(
@@ -592,7 +590,7 @@ void XpuDeviceInterface::setupHardwareFrameContextForEncoding(
 
   // sw_pix_fmt: the software (CPU-accessible) format the encoder consumes inside the surface
   // pix_fmt:    the hardware wrapper format the codec sees (must match hw_frames_ctx->format)
-  codecContext->sw_pix_fmt = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT; // AV_PIX_FMT_NV12
+  codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
   codecContext->pix_fmt    = AV_PIX_FMT_VAAPI;
 
   auto* hwFramesCtx = reinterpret_cast<AVHWFramesContext*>(hwFramesCtxRef->data);

From 1ef834451d4f6e85bb1860b15e1013fb00769205 Mon Sep 17 00:00:00 2001
From: Edgar Romo Montiel <edgar.romo.montiel@intel.com>
Date: Tue, 26 May 2026 15:11:31 -0700
Subject: [PATCH 3/3] Update convolution matrix for color conversion for
 encoding

How to reproduce FFmpeg RGB->YUV matrix values

1. Expose ff_fill_rgb2yuv_table in libavfilter/libavfilter.v:
   add "ff_fill_rgb2yuv_table;" under the global section.
   Example:
	libavfilter/libavfilter.v
	 LIBAVFILTER_MAJOR {
	     global:
	         avfilter_*;
	         av_*;
	+        ff_fill_rgb2yuv_table;
	     local:
	         *;
	 };

2. Rebuild FFmpeg:
   cd ffmpeg && ./configure && make -j$(nproc) && make install
   nm -D <prefix>/lib/libavfilter.so | grep ff_fill_rgb2yuv_table

3. Create rgb2yuv_test.c calling
   ff_fill_rgb2yuv_table(av_csp_luma_coeffs_from_avcsp(cs), m)
   for AVCOL_SPC_BT709, BT470BG.

4. Build:
   gcc rgb2yuv_test.c -o rgb2yuv_test \
       -I<prefix>/include -L<prefix>/lib \
       -lavfilter -lavutil -Wl,-rpath,<prefix>/lib

Signed-off-by: Edgar Romo Montiel <edgar.romo.montiel@intel.com>
---
 .../src/torchcodec_xpu/ColorConversionKernel.cpp            | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp b/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp
index eea9efa..b789261 100644
--- a/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp
+++ b/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp
@@ -35,9 +35,9 @@ struct rgb_matrix {
 // ============================================================
 struct yuv_matrix {
   static constexpr float3x3 BT709 = {
-    sycl::float3{  0.2126f,    0.7152f,    0.0722f  },  // Y
-    sycl::float3{ -0.1146f,   -0.3854f,    0.5f     },  // Cb
-    sycl::float3{  0.5f,      -0.4542f,   -0.0458f  }   // Cr
+    sycl::float3{  0.212600f,   0.715200f,   0.072200f  },  // Y
+    sycl::float3{ -0.114572f,  -0.385428f,   0.5f       },  // Cb
+    sycl::float3{  0.5f,       -0.454153f,  -0.045847f  }   // Cr
   };
   static constexpr float3x3 BT601 = {
     sycl::float3{  0.299f,     0.587f,     0.114f   },  // Y