From f317814816383f86bfd647d2a6d7c31b2c9ed430 Mon Sep 17 00:00:00 2001 From: Edgar Romo Montiel Date: Wed, 20 May 2026 16:36:05 -0700 Subject: [PATCH 1/3] Add XPU encoding support to Encoder Signed-off-by: Edgar Romo Montiel --- ...-Add-XPU-encoding-support-to-Encoder.patch | 81 ++++++ .../torchcodec_xpu/ColorConversionKernel.cpp | 143 ++++++++++ .../torchcodec_xpu/ColorConversionKernel.h | 18 ++ .../src/torchcodec_xpu/XpuDeviceInterface.cpp | 258 ++++++++++++++++++ .../src/torchcodec_xpu/XpuDeviceInterface.h | 23 ++ 5 files changed, 523 insertions(+) create mode 100644 packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch diff --git a/packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch b/packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch new file mode 100644 index 0000000..65be795 --- /dev/null +++ b/packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch @@ -0,0 +1,81 @@ +diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp +index 2fe136fb..d024834b 100644 +--- a/src/torchcodec/_core/Encoder.cpp ++++ b/src/torchcodec/_core/Encoder.cpp +@@ -787,7 +787,8 @@ void VideoEncoder::initializeEncoder( + // The default CUDA interface is decode-only; encoders need the FFmpeg-based + // one. + deviceInterface_ = createDeviceInterface( +- stableDevice, stableDevice.type() == kStableCUDA ? "ffmpeg" : "default"); ++ stableDevice, ++ stableDevice.type() == kStableCUDA ? "ffmpeg" : "default"); + const AVCodec* avCodec = nullptr; + // If codec arg is provided, find codec using logic similar to FFmpeg: + // https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835 +@@ -846,7 +847,8 @@ void VideoEncoder::initializeEncoder( + if (videoStreamOptions.pixelFormat.has_value()) { + // TODO-VideoEncoder: (P2) Enable pixel formats to be set by user on GPU + // and handled with the appropriate NPP function on GPU. +- if (frames_.device().type() == kStableCUDA) { ++ if (frames_.device().type() == kStableCUDA || ++ frames_.device().type() == kStableXPU) { + STD_TORCH_CHECK( + false, + "Video encoding on GPU currently only supports the nv12 pixel format. " +@@ -855,7 +857,8 @@ void VideoEncoder::initializeEncoder( + outPixelFormat = + validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value()); + } else { +- if (frames_.device().type() == kStableCUDA) { ++ if (frames_.device().type() == kStableCUDA || ++ frames_.device().type() == kStableXPU) { + // Default to nv12 pixel format when encoding on GPU. + outPixelFormat = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT; + } else { +@@ -910,7 +913,8 @@ void VideoEncoder::initializeEncoder( + 0); + } + +- if (frames_.device().type() == kStableCUDA) { ++ if (frames_.device().type() == kStableCUDA || ++ frames_.device().type() == kStableXPU) { + deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get()); + deviceInterface_->setupHardwareFrameContextForEncoding( + avCodecContext_.get()); +@@ -1124,7 +1128,8 @@ void MultiStreamEncoder::addVideoStream( + // The default CUDA interface is decode-only; encoders need the FFmpeg-based + // one. + videoStream_->deviceInterface = createDeviceInterface( +- stableDevice, stableDevice.type() == kStableCUDA ? "ffmpeg" : "default"); ++ stableDevice, ++ stableDevice.type() == kStableCUDA ? "ffmpeg" : "default"); + videoStream_->inHeight = height; + videoStream_->inWidth = width; + videoStream_->inFrameRate = frameRate; +@@ -1208,7 +1213,7 @@ void MultiStreamEncoder::initializeVideoStream() { + if (videoStream.options.pixelFormat.has_value()) { + // TODO-MultiStreamEncoder: (P2) Enable pixel formats to be set by user on + // GPU and handled with the appropriate NPP function on GPU. +- if (deviceType == kStableCUDA) { ++ if (deviceType == kStableCUDA || deviceType == kStableXPU) { + STD_TORCH_CHECK( + false, + "Video encoding on GPU currently only supports the nv12 pixel format. " +@@ -1217,7 +1222,7 @@ void MultiStreamEncoder::initializeVideoStream() { + outPixelFormat = + validatePixelFormat(*avCodec, videoStream.options.pixelFormat.value()); + } else { +- if (deviceType == kStableCUDA) { ++ if (deviceType == kStableCUDA || deviceType == kStableXPU) { + // Default to nv12 pixel format when encoding on GPU. + outPixelFormat = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT; + } else { +@@ -1275,7 +1280,7 @@ void MultiStreamEncoder::initializeVideoStream() { + 0); + } + +- if (deviceType == kStableCUDA) { ++ if (deviceType == kStableCUDA || deviceType == kStableXPU) { + videoStream.deviceInterface->registerHardwareDeviceWithCodec( + videoStream.avCodecContext.get()); + videoStream.deviceInterface->setupHardwareFrameContextForEncoding( diff --git a/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp b/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp index 01d25e2..eea9efa 100644 --- a/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp +++ b/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp @@ -9,6 +9,9 @@ namespace facebook::torchcodec { using float3x3 = std::array; +// ============================================================ +// Decoding matrices: YCbCr -> RGB (used by NV12toRGBKernel) +// ============================================================ struct rgb_matrix { static constexpr float3x3 BT709 = { sycl::float3{ 1.0, 0.0, 1.5748 }, @@ -23,6 +26,26 @@ struct rgb_matrix { }; }; +// ============================================================ +// Encoding matrices: RGB -> YCbCr (used by RGB24toNV12Kernel) +// Inverse of rgb_matrix above. +// Row 0: Y coefficients +// Row 1: Cb coefficients +// Row 2: Cr coefficients +// ============================================================ +struct yuv_matrix { + static constexpr float3x3 BT709 = { + sycl::float3{ 0.2126f, 0.7152f, 0.0722f }, // Y + sycl::float3{ -0.1146f, -0.3854f, 0.5f }, // Cb + sycl::float3{ 0.5f, -0.4542f, -0.0458f } // Cr + }; + static constexpr float3x3 BT601 = { + sycl::float3{ 0.299f, 0.587f, 0.114f }, // Y + sycl::float3{ -0.168736f, -0.331264f, 0.5f }, // Cb + sycl::float3{ 0.5f, -0.418688f, -0.081312f} // Cr + }; +}; + // Helper function for the Intel Tile-Y offset calculation // Intel Y-Tiling uses COLUMN-MAJOR OWord (16 bytes) organization // Tile: 128 bytes wide × 32 rows = 4KB @@ -166,6 +189,13 @@ const float3x3 getColorConversionMatrix(enum AVColorSpace colorspace) { return rgb_matrix::BT601; } +const float3x3 getYUVConversionMatrix(enum AVColorSpace colorspace) { + if (colorspace == AVCOL_SPC_BT709) { + return yuv_matrix::BT709; + } + return yuv_matrix::BT601; +} + void convertNV12ToRGB( sycl::queue& queue, const uint8_t* y_plane, @@ -201,5 +231,118 @@ void registerColorConversionKernel() { (void)s; } +// ============================================================ +// Encoding kernel: NCHW RGB tensor -> NV12 VAAPI surface +// ============================================================ +struct RGB24toNV12Kernel { + const uint8_t* rgb_nchw; // CHW uint8 device pointer (R, G, B planes) + int64_t ch_stride; // stride between channel planes + int64_t row_stride; // stride between rows within a plane + int64_t pixel_stride; // stride between adjacent pixels (1 for NCHW, 3 for HWC-permuted) + uint8_t* y_plane; + uint8_t* uv_plane; + int width; + int height; + int y_pitch; // surface Y-plane row pitch in bytes + int uv_pitch; // surface UV-plane row pitch in bytes + bool is_tiled; // true → Tile-Y; false → linear + bool fullrange; + float3x3 yuv_mat; + + RGB24toNV12Kernel( + const uint8_t* rgb_nchw_, + int64_t ch_stride_, + int64_t row_stride_, + int64_t pixel_stride_, + uint8_t* y_plane_, + uint8_t* uv_plane_, + int width_, + int height_, + int y_pitch_, + int uv_pitch_, + bool is_tiled_, + bool fullrange_, + const float3x3& yuv_mat_) + : rgb_nchw(rgb_nchw_), + ch_stride(ch_stride_), + row_stride(row_stride_), + pixel_stride(pixel_stride_), + y_plane(y_plane_), + uv_plane(uv_plane_), + width(width_), + height(height_), + y_pitch(y_pitch_), + uv_pitch(uv_pitch_), + is_tiled(is_tiled_), + fullrange(fullrange_), + yuv_mat(yuv_mat_) + {} + + void operator()(sycl::id<2> idx) const { + int x = idx[1]; + int y = idx[0]; + + if (x >= width || y >= height) { + return; + } + + // Read RGB from NCHW tensor. + float r = rgb_nchw[0 * ch_stride + y * row_stride + x * pixel_stride] / 255.0f; + float g = rgb_nchw[1 * ch_stride + y * row_stride + x * pixel_stride] / 255.0f; + float b = rgb_nchw[2 * ch_stride + y * row_stride + x * pixel_stride] / 255.0f; + sycl::float3 src{r, g, b}; + + // Luma Y — write to Tile-Y or linear destination + float Y_norm = sycl::dot(src, yuv_mat[0]); + float Y = fullrange ? Y_norm * 255.0f : 16.0f + Y_norm * 219.0f; + size_t y_dst = is_tiled ? get_tile_offset(x, y, y_pitch) + : (size_t)y * y_pitch + x; + y_plane[y_dst] = (uint8_t)std::clamp(Y, 0.0f, 255.0f); + + // Chroma UV: one pair per 2x2 block (NV12 4:2:0 subsampling). + if ((x % 2 == 0) && (y % 2 == 0)) { + float Cb_norm = sycl::dot(src, yuv_mat[1]); + float Cr_norm = sycl::dot(src, yuv_mat[2]); + float U = fullrange ? Cb_norm * 255.0f + 128.0f : 128.0f + Cb_norm * 224.0f; + float V = fullrange ? Cr_norm * 255.0f + 128.0f : 128.0f + Cr_norm * 224.0f; + size_t u_dst = is_tiled ? get_tile_offset(x, y / 2, uv_pitch) + : (size_t)(y / 2) * uv_pitch + x; + size_t v_dst = is_tiled ? get_tile_offset(x + 1, y / 2, uv_pitch) + : (size_t)(y / 2) * uv_pitch + x + 1; + uv_plane[u_dst] = (uint8_t)std::clamp(U, 0.0f, 255.0f); + uv_plane[v_dst] = (uint8_t)std::clamp(V, 0.0f, 255.0f); + } + } +}; + +void convertRGBToNV12( + sycl::queue& queue, + const uint8_t* rgb_nchw, + int64_t ch_stride, + int64_t row_stride, + int64_t pixel_stride, + uint8_t* dst_y, + uint8_t* dst_uv, + int width, + int height, + int y_pitch, + int uv_pitch, + bool is_tiled, + enum AVColorRange color_range, + enum AVColorSpace colorspace) { + bool fullrange = (color_range == AVCOL_RANGE_JPEG); + queue.submit([&](sycl::handler& cgh) { + RGB24toNV12Kernel kernel( + rgb_nchw, ch_stride, row_stride, pixel_stride, + dst_y, dst_uv, + width, height, + y_pitch, uv_pitch, + is_tiled, + fullrange, getYUVConversionMatrix(colorspace)); + cgh.parallel_for(sycl::range<2>(height, width), kernel); + }); + queue.wait(); +} + } // namespace facebook::torchcodec #endif // WITH_SYCL_KERNELS diff --git a/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.h b/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.h index e063397..e4f3c40 100644 --- a/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.h +++ b/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.h @@ -24,6 +24,24 @@ void convertNV12ToRGB( enum AVColorRange color_range, enum AVColorSpace colorspace); +// Encoding: NCHW uint8 RGB tensor (on XPU) -> NV12 VAAPI surface. +// is_tiled: true for Intel Tile-Y surfaces (drm_format_modifier != 0), false for linear. +void convertRGBToNV12( + sycl::queue& queue, + const uint8_t* rgb_nchw, + int64_t ch_stride, + int64_t row_stride, + int64_t pixel_stride, + uint8_t* dst_y, + uint8_t* dst_uv, + int width, + int height, + int y_pitch, + int uv_pitch, + bool is_tiled, + enum AVColorRange color_range, + enum AVColorSpace colorspace); + // Anchor function to force kernel registration void registerColorConversionKernel(); diff --git a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp index e017bd5..69318cb 100644 --- a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp +++ b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -569,4 +570,261 @@ std::optional XpuDeviceInterface::findCodec( return std::nullopt; } +// ============================================================ +// Encoding: setupHardwareFrameContextForEncoding +// ============================================================ +// Allocates and initializes a VAAPI hw_frames_ctx on the codec context. +// Mirrors the CUDA implementation in CudaDeviceInterface.cpp, with +// AV_PIX_FMT_CUDA -> AV_PIX_FMT_VAAPI as the only pixel format change. +void XpuDeviceInterface::setupHardwareFrameContextForEncoding( + AVCodecContext* codecContext) { + TORCH_CHECK( + ctx_, + "VAAPI hw device context is not initialized. " + "This device may not have a media engine (e.g. PVC/Ponte Vecchio). " + "Encoding via XPU is only supported on devices with VAAPI."); + TORCH_CHECK(codecContext != nullptr, "codecContext is null"); + + AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(ctx_.get()); + TORCH_CHECK( + hwFramesCtxRef != nullptr, + "Failed to allocate VAAPI hw frames context for codec"); + + // sw_pix_fmt: the software (CPU-accessible) format the encoder consumes inside the surface + // pix_fmt: the hardware wrapper format the codec sees (must match hw_frames_ctx->format) + codecContext->sw_pix_fmt = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT; // AV_PIX_FMT_NV12 + codecContext->pix_fmt = AV_PIX_FMT_VAAPI; + + auto* hwFramesCtx = reinterpret_cast(hwFramesCtxRef->data); + hwFramesCtx->format = AV_PIX_FMT_VAAPI; + hwFramesCtx->sw_format = AV_PIX_FMT_NV12; + hwFramesCtx->width = codecContext->width; + hwFramesCtx->height = codecContext->height; + + int ret = av_hwframe_ctx_init(hwFramesCtxRef); + if (ret < 0) { + av_buffer_unref(&hwFramesCtxRef); + TORCH_CHECK( + false, + "Failed to initialize VAAPI hw frames context: ", + getFFMPEGErrorStringFromErrorCode(ret)); + } + codecContext->hw_frames_ctx = hwFramesCtxRef; +} + +// ============================================================ +// Encoding: convertTensorToAVFrameForEncoding +// ============================================================ +UniqueAVFrame XpuDeviceInterface::convertTensorToAVFrameForEncoding( + const torch::stable::Tensor& tensor, + int frameIndex, + AVCodecContext* codecContext) { + TORCH_CHECK( + tensor.dim() == 3 && tensor.sizes()[0] == 3, + "Expected CHW tensor with 3 channels (RGB), got shape: ", + tensor.sizes()[0], "x", tensor.sizes()[1], "x", tensor.sizes()[2]); + TORCH_CHECK(codecContext != nullptr, "codecContext is null"); + TORCH_CHECK( + codecContext->hw_frames_ctx != nullptr, + "hw_frames_ctx is null: call setupHardwareFrameContextForEncoding first"); + + UniqueAVFrame vaFrame(av_frame_alloc()); + TORCH_CHECK(vaFrame != nullptr, "Failed to allocate AVFrame for encoding"); + vaFrame->format = AV_PIX_FMT_VAAPI; + vaFrame->height = static_cast(tensor.sizes()[1]); + vaFrame->width = static_cast(tensor.sizes()[2]); + vaFrame->pts = frameIndex; + + // Allocate a VAAPI surface from the hw_frames_ctx pool created in + // setupHardwareFrameContextForEncoding. + int ret = av_hwframe_get_buffer(codecContext->hw_frames_ctx, vaFrame.get(), 0); + TORCH_CHECK( + ret >= 0, + "av_hwframe_get_buffer failed: ", + getFFMPEGErrorStringFromErrorCode(ret)); + +#ifdef WITH_SYCL_KERNELS + if (xpu::use_sycl_color_conversion_kernel()) { + VLOG(9) << "[XPU Encoder] Encoding frame " << frameIndex + << " via SYCL on device=xpu:" << device_.index(); + return encodeConvert_SYCL(tensor, codecContext, std::move(vaFrame)); + } +#endif + VLOG(9) << "[XPU Encoder] Encoding frame " << frameIndex << " via CPU fallback"; + return encodeConvert_CPU(tensor, codecContext, std::move(vaFrame)); +} + +// ============================================================ +// Encoding: encodeConvert_SYCL +// ============================================================ +UniqueAVFrame XpuDeviceInterface::encodeConvert_SYCL( + const torch::stable::Tensor& tensor, + AVCodecContext* codecContext, + UniqueAVFrame vaFrame) { +#ifdef WITH_SYCL_KERNELS + VADisplay display = getVaDisplayFromAV(vaFrame.get()); + VASurfaceID surfaceId = (VASurfaceID)(uintptr_t)vaFrame->data[3]; + + VADRMPRIMESurfaceDescriptor desc{}; + VAStatus sts = vaExportSurfaceHandle( + display, + surfaceId, + VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2, + VA_EXPORT_SURFACE_WRITE_ONLY, // write for encoding (vs. READ_ONLY for decoding) + &desc); + TORCH_CHECK( + sts == VA_STATUS_SUCCESS, + "vaExportSurfaceHandle (WRITE_ONLY) failed: ", + vaErrorStr(sts)); + TORCH_CHECK(desc.num_objects == 1, "Expected 1 DMA-BUF object, got ", desc.num_objects); + // NV12 surfaces can be exported in two valid layouts depending on the driver: + // Layout A: 1 layer, 2 planes — layers[0].planes[0]=Y, layers[0].planes[1]=UV + // Layout B: 2 layers, 1 plane each — layers[0].planes[0]=Y, layers[1].planes[0]=UV + const bool layoutA = (desc.num_layers == 1 && desc.layers[0].num_planes == 2); + const bool layoutB = (desc.num_layers == 2 && desc.layers[0].num_planes == 1 + && desc.layers[1].num_planes == 1); + TORCH_CHECK( + layoutA || layoutB, + "Unsupported NV12 export layout: num_layers=", desc.num_layers, + " layers[0].num_planes=", desc.layers[0].num_planes); + // Get Level Zero context and device handles via SYCL interop. + sycl::queue queue = c10::xpu::getCurrentXPUStream(device_.index()); + ze_context_handle_t zeCtx = nullptr; + ze_device_handle_t zeDevice = nullptr; + queue + .submit([&](sycl::handler& cgh) { + cgh.host_task([&](const sycl::interop_handle& ih) { + zeCtx = ih.get_native_context(); + zeDevice = ih.get_native_device(); + }); + }) + .wait(); + + ze_external_memory_import_fd_t import_fd_desc{}; + import_fd_desc.stype = ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_FD; + import_fd_desc.flags = ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF; + import_fd_desc.fd = desc.objects[0].fd; + + ze_device_mem_alloc_desc_t alloc_desc{}; + alloc_desc.pNext = &import_fd_desc; + void* usm_ptr = nullptr; + ze_result_t res = zeMemAllocDevice( + zeCtx, &alloc_desc, desc.objects[0].size, 0, zeDevice, &usm_ptr); + TORCH_CHECK( + res == ZE_RESULT_SUCCESS, + "zeMemAllocDevice failed importing encode surface fd=", + desc.objects[0].fd); + + // Extract Y and UV plane pointers and pitches for both layouts + uint8_t* y_ptr; + uint8_t* uv_ptr; + int y_pitch, uv_pitch; + if (layoutA) { + y_ptr = static_cast(usm_ptr) + desc.layers[0].offset[0]; + uv_ptr = static_cast(usm_ptr) + desc.layers[0].offset[1]; + y_pitch = static_cast(desc.layers[0].pitch[0]); + uv_pitch = static_cast(desc.layers[0].pitch[1]); + } else { + y_ptr = static_cast(usm_ptr) + desc.layers[0].offset[0]; + uv_ptr = static_cast(usm_ptr) + desc.layers[1].offset[0]; + y_pitch = static_cast(desc.layers[0].pitch[0]); + uv_pitch = static_cast(desc.layers[1].pitch[0]); + } + + // drm_format_modifier != 0 means tiled (e.g. Intel Tile-Y on BMG/Gen12+). + const bool is_tiled = (desc.objects[0].drm_format_modifier != 0); + convertRGBToNV12( + queue, + static_cast(tensor.data_ptr()), + tensor.strides()[0], // ch_stride + tensor.strides()[1], // row_stride + tensor.strides()[2], // pixel_stride + y_ptr, + uv_ptr, + vaFrame->width, + vaFrame->height, + y_pitch, + uv_pitch, + is_tiled, + codecContext->color_range, + codecContext->colorspace); + + zeMemFree(zeCtx, usm_ptr); + close(desc.objects[0].fd); + + vaFrame->colorspace = codecContext->colorspace; + vaFrame->color_range = codecContext->color_range; + return vaFrame; +#else + return encodeConvert_CPU(tensor, codecContext, std::move(vaFrame)); +#endif +} + +// ============================================================ +// Encoding: encodeConvert_CPU (CPU fallback) +// ============================================================ +UniqueAVFrame XpuDeviceInterface::encodeConvert_CPU( + const torch::stable::Tensor& tensor, + AVCodecContext* codecContext, + UniqueAVFrame vaFrame) { + // Move XPU tensor to CPU (blocking) + torch::stable::Tensor cpuTensor = + torch::stable::to(tensor, StableDevice(kStableCPU, 0)); + + const uint8_t* data = static_cast(cpuTensor.data_ptr()); + // strides() are in elements (uint8), so they equal byte strides here. + int64_t ch_stride = cpuTensor.strides()[0]; + int64_t row_stride = cpuTensor.strides()[1]; + + // Allocate an intermediate CPU NV12 frame for sws_scale output + UniqueAVFrame cpuFrame(av_frame_alloc()); + TORCH_CHECK(cpuFrame != nullptr, "Failed to allocate CPU NV12 AVFrame"); + cpuFrame->format = AV_PIX_FMT_NV12; + cpuFrame->width = vaFrame->width; + cpuFrame->height = vaFrame->height; + int ret = av_frame_get_buffer(cpuFrame.get(), 0); + TORCH_CHECK(ret >= 0, "av_frame_get_buffer (NV12) failed: ", + getFFMPEGErrorStringFromErrorCode(ret)); + + // Zero-copy GBRP view of the NCHW tensor (GBRP plane order: G=ch1, B=ch2, R=ch0). + UniqueAVFrame gbrpFrame(av_frame_alloc()); + TORCH_CHECK(gbrpFrame != nullptr, "Failed to allocate GBRP AVFrame"); + gbrpFrame->format = AV_PIX_FMT_GBRP; + gbrpFrame->width = vaFrame->width; + gbrpFrame->height = vaFrame->height; + gbrpFrame->data[0] = const_cast(data + 1 * ch_stride); // G + gbrpFrame->data[1] = const_cast(data + 2 * ch_stride); // B + gbrpFrame->data[2] = const_cast(data + 0 * ch_stride); // R + gbrpFrame->linesize[0] = static_cast(row_stride); + gbrpFrame->linesize[1] = static_cast(row_stride); + gbrpFrame->linesize[2] = static_cast(row_stride); + + // GBRP -> NV12 via libswscale + SwsContext* swsCtx = sws_getContext( + vaFrame->width, vaFrame->height, AV_PIX_FMT_GBRP, + vaFrame->width, vaFrame->height, AV_PIX_FMT_NV12, + SWS_BILINEAR, nullptr, nullptr, nullptr); + TORCH_CHECK(swsCtx != nullptr, "sws_getContext(GBRP->NV12) failed"); + sws_scale( + swsCtx, + gbrpFrame->data, + gbrpFrame->linesize, + 0, + vaFrame->height, + cpuFrame->data, + cpuFrame->linesize); + sws_freeContext(swsCtx); + + // Upload CPU NV12 -> VAAPI surface + ret = av_hwframe_transfer_data(vaFrame.get(), cpuFrame.get(), 0); + TORCH_CHECK( + ret >= 0, + "av_hwframe_transfer_data (NV12->VAAPI) failed: ", + getFFMPEGErrorStringFromErrorCode(ret)); + + vaFrame->colorspace = codecContext->colorspace; + vaFrame->color_range = codecContext->color_range; + return vaFrame; +} + } // namespace facebook::torchcodec diff --git a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.h b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.h index 71b7566..682a312 100644 --- a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.h +++ b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.h @@ -32,6 +32,15 @@ class XpuDeviceInterface : public DeviceInterface { void registerHardwareDeviceWithCodec(AVCodecContext* codecContext) override; + // ---- Encoding overrides ---- + UniqueAVFrame convertTensorToAVFrameForEncoding( + const torch::stable::Tensor& tensor, + int frameIndex, + AVCodecContext* codecContext) override; + + void setupHardwareFrameContextForEncoding( + AVCodecContext* codecContext) override; + void convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, FrameOutput& frameOutput, @@ -64,6 +73,20 @@ class XpuDeviceInterface : public DeviceInterface { void convertAVFrameToFrameOutput_FilterGraph( UniqueAVFrame& avFrame, torch::stable::Tensor& dst); + + // ---- Encoding helpers ---- + // SYCL path: exports VAAPI surface as DMA-BUF, imports via Level Zero USM, + // runs convertRGBToNV12 kernel directly on the surface memory. + UniqueAVFrame encodeConvert_SYCL( + const torch::stable::Tensor& tensor, + AVCodecContext* codecContext, + UniqueAVFrame vaFrame); + // CPU fallback: moves tensor to CPU, uses libswscale GBRP->NV12, + // then av_hwframe_transfer_data to upload into the VAAPI surface. + UniqueAVFrame encodeConvert_CPU( + const torch::stable::Tensor& tensor, + AVCodecContext* codecContext, + UniqueAVFrame vaFrame); }; } // namespace facebook::torchcodec From ad842da81745a42591441e5fd7ab58cb8d30a321 Mon Sep 17 00:00:00 2001 From: Edgar Romo Montiel Date: Mon, 25 May 2026 14:48:37 -0700 Subject: [PATCH 2/3] Update patch and constant for context sw_pixel conversion in XPU case. Signed-off-by: Edgar Romo Montiel --- ...-Add-XPU-encoding-support-to-Encoder.patch | 32 ++++--------------- .../src/torchcodec_xpu/XpuDeviceInterface.cpp | 8 ++--- 2 files changed, 9 insertions(+), 31 deletions(-) diff --git a/packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch b/packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch index 65be795..d11c40c 100644 --- a/packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch +++ b/packages/torchcodec-xpu/patches/0002-Add-XPU-encoding-support-to-Encoder.patch @@ -2,17 +2,7 @@ diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp index 2fe136fb..d024834b 100644 --- a/src/torchcodec/_core/Encoder.cpp +++ b/src/torchcodec/_core/Encoder.cpp -@@ -787,7 +787,8 @@ void VideoEncoder::initializeEncoder( - // The default CUDA interface is decode-only; encoders need the FFmpeg-based - // one. - deviceInterface_ = createDeviceInterface( -- stableDevice, stableDevice.type() == kStableCUDA ? "ffmpeg" : "default"); -+ stableDevice, -+ stableDevice.type() == kStableCUDA ? "ffmpeg" : "default"); - const AVCodec* avCodec = nullptr; - // If codec arg is provided, find codec using logic similar to FFmpeg: - // https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835 -@@ -846,7 +847,8 @@ void VideoEncoder::initializeEncoder( +@@ -846,7 +846,8 @@ void VideoEncoder::initializeEncoder( if (videoStreamOptions.pixelFormat.has_value()) { // TODO-VideoEncoder: (P2) Enable pixel formats to be set by user on GPU // and handled with the appropriate NPP function on GPU. @@ -22,7 +12,7 @@ index 2fe136fb..d024834b 100644 STD_TORCH_CHECK( false, "Video encoding on GPU currently only supports the nv12 pixel format. " -@@ -855,7 +857,8 @@ void VideoEncoder::initializeEncoder( +@@ -855,7 +856,8 @@ void VideoEncoder::initializeEncoder( outPixelFormat = validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value()); } else { @@ -32,7 +22,7 @@ index 2fe136fb..d024834b 100644 // Default to nv12 pixel format when encoding on GPU. outPixelFormat = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT; } else { -@@ -910,7 +913,8 @@ void VideoEncoder::initializeEncoder( +@@ -910,7 +912,8 @@ void VideoEncoder::initializeEncoder( 0); } @@ -42,17 +32,7 @@ index 2fe136fb..d024834b 100644 deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get()); deviceInterface_->setupHardwareFrameContextForEncoding( avCodecContext_.get()); -@@ -1124,7 +1128,8 @@ void MultiStreamEncoder::addVideoStream( - // The default CUDA interface is decode-only; encoders need the FFmpeg-based - // one. - videoStream_->deviceInterface = createDeviceInterface( -- stableDevice, stableDevice.type() == kStableCUDA ? "ffmpeg" : "default"); -+ stableDevice, -+ stableDevice.type() == kStableCUDA ? "ffmpeg" : "default"); - videoStream_->inHeight = height; - videoStream_->inWidth = width; - videoStream_->inFrameRate = frameRate; -@@ -1208,7 +1213,7 @@ void MultiStreamEncoder::initializeVideoStream() { +@@ -1208,7 +1211,7 @@ void MultiStreamEncoder::initializeVideoStream() { if (videoStream.options.pixelFormat.has_value()) { // TODO-MultiStreamEncoder: (P2) Enable pixel formats to be set by user on // GPU and handled with the appropriate NPP function on GPU. @@ -61,7 +41,7 @@ index 2fe136fb..d024834b 100644 STD_TORCH_CHECK( false, "Video encoding on GPU currently only supports the nv12 pixel format. " -@@ -1217,7 +1222,7 @@ void MultiStreamEncoder::initializeVideoStream() { +@@ -1217,7 +1220,7 @@ void MultiStreamEncoder::initializeVideoStream() { outPixelFormat = validatePixelFormat(*avCodec, videoStream.options.pixelFormat.value()); } else { @@ -70,7 +50,7 @@ index 2fe136fb..d024834b 100644 // Default to nv12 pixel format when encoding on GPU. outPixelFormat = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT; } else { -@@ -1275,7 +1280,7 @@ void MultiStreamEncoder::initializeVideoStream() { +@@ -1275,7 +1278,7 @@ void MultiStreamEncoder::initializeVideoStream() { 0); } diff --git a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp index 69318cb..09b4b6f 100644 --- a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp +++ b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include @@ -573,9 +572,8 @@ std::optional XpuDeviceInterface::findCodec( // ============================================================ // Encoding: setupHardwareFrameContextForEncoding // ============================================================ -// Allocates and initializes a VAAPI hw_frames_ctx on the codec context. -// Mirrors the CUDA implementation in CudaDeviceInterface.cpp, with -// AV_PIX_FMT_CUDA -> AV_PIX_FMT_VAAPI as the only pixel format change. +// Allocates a VAAPI hw_frames_ctx on the codec context so the encoder +// can write directly into VAAPI surfaces (NV12 layout, VAAPI wrapper). void XpuDeviceInterface::setupHardwareFrameContextForEncoding( AVCodecContext* codecContext) { TORCH_CHECK( @@ -592,7 +590,7 @@ void XpuDeviceInterface::setupHardwareFrameContextForEncoding( // sw_pix_fmt: the software (CPU-accessible) format the encoder consumes inside the surface // pix_fmt: the hardware wrapper format the codec sees (must match hw_frames_ctx->format) - codecContext->sw_pix_fmt = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT; // AV_PIX_FMT_NV12 + codecContext->sw_pix_fmt = AV_PIX_FMT_NV12; codecContext->pix_fmt = AV_PIX_FMT_VAAPI; auto* hwFramesCtx = reinterpret_cast(hwFramesCtxRef->data); From 1ef834451d4f6e85bb1860b15e1013fb00769205 Mon Sep 17 00:00:00 2001 From: Edgar Romo Montiel Date: Tue, 26 May 2026 15:11:31 -0700 Subject: [PATCH 3/3] Update convolution matrix for color conversion for encoding How to reproduce FFmpeg RGB->YUV matrix values 1. Expose ff_fill_rgb2yuv_table in libavfilter/libavfilter.v: add "ff_fill_rgb2yuv_table;" under the global section. Example: libavfilter/libavfilter.v LIBAVFILTER_MAJOR { global: avfilter_*; av_*; + ff_fill_rgb2yuv_table; local: *; }; 2. Rebuild FFmpeg: cd ffmpeg && ./configure && make -j$(nproc) && make install nm -D /lib/libavfilter.so | grep ff_fill_rgb2yuv_table 3. Create rgb2yuv_test.c calling ff_fill_rgb2yuv_table(av_csp_luma_coeffs_from_avcsp(cs), m) for AVCOL_SPC_BT709, BT470BG. 4. Build: gcc rgb2yuv_test.c -o rgb2yuv_test \ -I/include -L/lib \ -lavfilter -lavutil -Wl,-rpath,/lib Signed-off-by: Edgar Romo Montiel --- .../src/torchcodec_xpu/ColorConversionKernel.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp b/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp index eea9efa..b789261 100644 --- a/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp +++ b/packages/torchcodec-xpu/src/torchcodec_xpu/ColorConversionKernel.cpp @@ -35,9 +35,9 @@ struct rgb_matrix { // ============================================================ struct yuv_matrix { static constexpr float3x3 BT709 = { - sycl::float3{ 0.2126f, 0.7152f, 0.0722f }, // Y - sycl::float3{ -0.1146f, -0.3854f, 0.5f }, // Cb - sycl::float3{ 0.5f, -0.4542f, -0.0458f } // Cr + sycl::float3{ 0.212600f, 0.715200f, 0.072200f }, // Y + sycl::float3{ -0.114572f, -0.385428f, 0.5f }, // Cb + sycl::float3{ 0.5f, -0.454153f, -0.045847f } // Cr }; static constexpr float3x3 BT601 = { sycl::float3{ 0.299f, 0.587f, 0.114f }, // Y