From 2c3766acaa4150ea027f0ab73596b2cb3752fb2b Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Wed, 13 May 2026 16:29:44 +0800 Subject: [PATCH 1/6] support megatron fp4 --- docs/source/Megatron-SWIFT/Command-line-parameters.md | 10 ++++++++-- .../Megatron-SWIFT/Command-line-parameters.md | 10 ++++++++-- swift/megatron/arguments/megatron_args.py | 7 ++++++- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/docs/source/Megatron-SWIFT/Command-line-parameters.md b/docs/source/Megatron-SWIFT/Command-line-parameters.md index aa618049b0..2f7a344c2e 100644 --- a/docs/source/Megatron-SWIFT/Command-line-parameters.md +++ b/docs/source/Megatron-SWIFT/Command-line-parameters.md @@ -173,10 +173,16 @@ **fp8参数**: - fp8_format: 用于前向和反向传播中FP8张量的FP8格式方案。可选为'e4m3','hybrid'。默认为None。 - fp8_recipe: 用于前向和反向传播中 FP8 张量的 FP8 算法方案。可选为'tensorwise', 'delayed', 'mxfp8', 'blockwise'。默认为'delayed'。其中blockwise fp8需要 cuda129 以上版本。 -- fp8_amax_history_len: 每个张量记录 amax 历史的步数。默认为1024。 -- fp8_amax_compute_algo: 用于根据历史记录计算 amax 的算法。可选为'most_recent', 'max'。默认为'max'。 - fp8_param_gather: 保持计算参数为 fp8(不使用任何其他中间数据类型),并在 fp8 格式下执行参数的 all-gather 操作。默认为False。 - 提示:若想导出FP8权重格式,设置为True;否则设置为False。 +- fp8_amax_history_len: 每个张量记录 amax 历史的步数。默认为1024。 +- fp8_amax_compute_algo: 用于根据历史记录计算 amax 的算法。可选为'most_recent', 'max'。默认为'max'。 + +**fp4参数**: +- fp4_format: 用于前向和反向传播中FP8张量的FP4格式方案,可选为'e2m1'。默认为None。 +- fp4_recipe: 若设置此参数,则通过 Transformer Engine 启用 FP4 精度。目前仅支持 'nvfp4',该选项使用适用于 Blackwell+ 架构的 NVFP4BlockScaling 方案。 +- fp4_param_gather: 若设置此参数,则将参数保持为 FP4 精度以节省内存。注意并非所有参数都会被转换为 FP4,例如偏置项将保持不变。 + **混合精度参数**: - fp16: fp16模式。默认为None,会根据模型的torch_dtype进行设置,即torch_dtype为float16或者float32则fp16设置为True。torch_dtype默认读取config.json。 diff --git a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md index c82d008a19..03a785f694 100644 --- a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md +++ 
b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md @@ -182,10 +182,16 @@ For guidance on selecting parallelization strategies, please refer to the [Train **FP8 Parameters**: - fp8_format: The FP8 format scheme used for FP8 tensors in the forward and backward pass. Options are 'e4m3' and 'hybrid'. Default is None. - fp8_recipe: The FP8 recipe (algorithm scheme) used for FP8 tensors in the forward and backward pass. Options are 'tensorwise', 'delayed', 'mxfp8', and 'blockwise'. Default is 'delayed'. Note that blockwise fp8 requires CUDA version 12.9 or higher. -- fp8_amax_history_len: Number of steps for which amax history is recorded per tensor. Default is 1024. -- fp8_amax_compute_algo: Algorithm for computing amax from history. Options are 'most_recent' and 'max'. Default is 'max'. - fp8_param_gather: Keep the compute parameter in FP8 (do not use any other intermediate dtype) and perform the parameter all-gather in FP8 format. Default is False. - Tips: Set this to True if you want to export weights in FP8 format; otherwise, set it to False. +- fp8_amax_history_len: Number of steps for which amax history is recorded per tensor. Default is 1024. +- fp8_amax_compute_algo: Algorithm for computing amax from history. Options are 'most_recent' and 'max'. Default is 'max'. + +**fp4 Parameters**: + +- `fp4_format`: The FP4 format scheme for FP8 tensors in forward and backward passes, optionally set to `'e2m1'`. Defaults to `None`. +- `fp4_recipe`: If set, enables FP4 precision through Transformer Engine. Currently only `'nvfp4'` is supported, which uses the NVFP4BlockScaling recipe for Blackwell+ architecture. +- `fp4_param_gather`: If set, keeps the parameters in FP4 precision to save memory. Note that not all parameters will be converted to FP4; for example, biases will remain unchanged. 
**Mixed Precision Parameters**: diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py index 8fc1cfc1b3..8d2bd65a39 100644 --- a/swift/megatron/arguments/megatron_args.py +++ b/swift/megatron/arguments/megatron_args.py @@ -524,9 +524,14 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin): # fp8 fp8_format: Literal['e4m3', 'hybrid'] = None fp8_recipe: Literal['tensorwise', 'delayed', 'mxfp8', 'blockwise'] = 'delayed' + fp8_param_gather: bool = False fp8_amax_history_len: int = 1024 fp8_amax_compute_algo: Literal['most_recent', 'max'] = 'max' - fp8_param_gather: bool = False + + # fp4 + fp4_format: Literal['e2m1'] = None + fp4_recipe: Literal['nvfp4'] = 'nvfp4' + fp4_param_gather: bool = False # mixed precision fp16: Optional[bool] = None From c0d700f24ed34e8f6860af28cbc365d08ffd9e28 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Wed, 13 May 2026 16:38:00 +0800 Subject: [PATCH 2/6] fix --- docs/source/Megatron-SWIFT/Command-line-parameters.md | 2 +- docs/source_en/Megatron-SWIFT/Command-line-parameters.md | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/source/Megatron-SWIFT/Command-line-parameters.md b/docs/source/Megatron-SWIFT/Command-line-parameters.md index 2f7a344c2e..6a61231df8 100644 --- a/docs/source/Megatron-SWIFT/Command-line-parameters.md +++ b/docs/source/Megatron-SWIFT/Command-line-parameters.md @@ -179,7 +179,7 @@ - fp8_amax_compute_algo: 用于根据历史记录计算 amax 的算法。可选为'most_recent', 'max'。默认为'max'。 **fp4参数**: -- fp4_format: 用于前向和反向传播中FP8张量的FP4格式方案,可选为'e2m1'。默认为None。 +- fp4_format: 用于前向和反向传播中FP4张量的FP4格式方案,可选为'e2m1'。默认为None。 - fp4_recipe: 若设置此参数,则通过 Transformer Engine 启用 FP4 精度。目前仅支持 'nvfp4',该选项使用适用于 Blackwell+ 架构的 NVFP4BlockScaling 方案。 - fp4_param_gather: 若设置此参数,则将参数保持为 FP4 精度以节省内存。注意并非所有参数都会被转换为 FP4,例如偏置项将保持不变。 diff --git a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md index 
03a785f694..d3cd57ebe8 100644 --- a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md +++ b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md @@ -187,12 +187,11 @@ For guidance on selecting parallelization strategies, please refer to the [Train - fp8_amax_history_len: Number of steps for which amax history is recorded per tensor. Default is 1024. - fp8_amax_compute_algo: Algorithm for computing amax from history. Options are 'most_recent' and 'max'. Default is 'max'. -**fp4 Parameters**: - -- `fp4_format`: The FP4 format scheme for FP8 tensors in forward and backward passes, optionally set to `'e2m1'`. Defaults to `None`. -- `fp4_recipe`: If set, enables FP4 precision through Transformer Engine. Currently only `'nvfp4'` is supported, which uses the NVFP4BlockScaling recipe for Blackwell+ architecture. -- `fp4_param_gather`: If set, keeps the parameters in FP4 precision to save memory. Note that not all parameters will be converted to FP4; for example, biases will remain unchanged. +**FP4 Parameters**: +- fp4_format: The FP4 format scheme for FP4 tensors in forward and backward passes, optionally set to 'e2m1'. Defaults to None. +- fp4_recipe: If set, enables FP4 precision through Transformer Engine. Currently only 'nvfp4' is supported, which uses the NVFP4BlockScaling recipe for Blackwell+ architecture. +- fp4_param_gather: If set, keeps the parameters in FP4 precision to save memory. Note that not all parameters will be converted to FP4; for example, biases will remain unchanged. 
**Mixed Precision Parameters**: From cd53d7feb45dff969f8d5da488194afbada6e22a Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Wed, 13 May 2026 17:22:12 +0800 Subject: [PATCH 3/6] update --- swift/megatron/arguments/megatron_args.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py index 8d2bd65a39..ed1a497182 100644 --- a/swift/megatron/arguments/megatron_args.py +++ b/swift/megatron/arguments/megatron_args.py @@ -697,7 +697,10 @@ def __post_init__(self): or self.decoder_last_pipeline_num_layers is not None): raise ValueError('pipeline_model_parallel_size must be greater than 1 if you want to set ' 'decoder_first_pipeline_num_layers or decoder_last_pipeline_num_layers.') - self.fp8 = self.fp8_format # compat megatron-lm + # compat megatron-lm + self.fp8 = self.fp8_format + self.fp4 = self.fp4_format + if self.task_type not in {'causal_lm', 'generative_reranker'}: self.untie_embeddings_and_output_weights = True if self.vit_gradient_checkpointing_kwargs is not None: From 0de71a030b2cf6c2080bd0372a59bf7df94d88c1 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 19 May 2026 01:58:22 +0800 Subject: [PATCH 4/6] update --- swift/megatron/utils/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/swift/megatron/utils/utils.py b/swift/megatron/utils/utils.py index 59a276b498..d387eef42b 100644 --- a/swift/megatron/utils/utils.py +++ b/swift/megatron/utils/utils.py @@ -220,10 +220,11 @@ def get_padding_to(args): padding_to = (padding_to or 1) * args.context_parallel_size origin_padding_to = padding_to fp8_format = getattr(args, 'fp8_format', None) or getattr(args, 'fp8', None) + fp4_format = getattr(args, 'fp4_format', None) or getattr(args, 'fp4', None) if args.fp8_recipe == 'blockwise': padding_to = (padding_to or 1) * 128 - elif fp8_format is not None: - padding_to = max((padding_to or 1) * 8, 16) + elif fp8_format is not None or 
fp4_format is not None: + padding_to = (padding_to or 1) * 16 if args.attention_backend == 'fused': padding_to = max(padding_to or 1, ((origin_padding_to) or 1) * 64) return padding_to From 9f28323a4f3e7a441c7e36c7eafab8cbf753e8e9 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 19 May 2026 17:34:07 +0800 Subject: [PATCH 5/6] fix --- swift/megatron/arguments/megatron_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py index ed1e88f533..01f59d628a 100644 --- a/swift/megatron/arguments/megatron_args.py +++ b/swift/megatron/arguments/megatron_args.py @@ -705,7 +705,7 @@ def __post_init__(self): or self.decoder_last_pipeline_num_layers is not None): raise ValueError('pipeline_model_parallel_size must be greater than 1 if you want to set ' 'decoder_first_pipeline_num_layers or decoder_last_pipeline_num_layers.') - # compat megatron-lm + # compat megatron-core self.fp8 = self.fp8_format self.fp4 = self.fp4_format From 7c7323b26fa5f3d0f3ebe5d6f83faef4800e4e75 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 19 May 2026 17:38:12 +0800 Subject: [PATCH 6/6] update --- docs/source/Megatron-SWIFT/Command-line-parameters.md | 4 ++-- docs/source_en/Megatron-SWIFT/Command-line-parameters.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/Megatron-SWIFT/Command-line-parameters.md b/docs/source/Megatron-SWIFT/Command-line-parameters.md index 0ce91edc30..d3ebbd5c43 100644 --- a/docs/source/Megatron-SWIFT/Command-line-parameters.md +++ b/docs/source/Megatron-SWIFT/Command-line-parameters.md @@ -181,8 +181,8 @@ **fp4参数**: - fp4_format: 用于前向和反向传播中FP4张量的FP4格式方案,可选为'e2m1'。默认为None。 -- fp4_recipe: 若设置此参数,则通过 Transformer Engine 启用 FP4 精度。目前仅支持 'nvfp4',该选项使用适用于 Blackwell+ 架构的 NVFP4BlockScaling 方案。 -- fp4_param_gather: 若设置此参数,则将参数保持为 FP4 精度以节省内存。注意并非所有参数都会被转换为 FP4,例如偏置项将保持不变。 +- fp4_recipe: 若设置此参数,则通过 Transformer Engine 启用 FP4 精度。目前仅支持 
'nvfp4',该选项使用适用于 Blackwell+ 架构的 NVFP4BlockScaling 方案。默认为'nvfp4'。注意:该参数仅在设置了 fp4_format 时生效。 +- fp4_param_gather: 若设置为True,则将参数保持为 FP4 精度以节省内存。注意并非所有参数都会被转换为 FP4,例如偏置项将保持不变。默认为False。 **混合精度参数**: diff --git a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md index 2a8d177f9f..f6263d5356 100644 --- a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md +++ b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md @@ -191,8 +191,8 @@ For guidance on selecting parallelization strategies, please refer to the [Train **FP4 Parameters**: - fp4_format: The FP4 format scheme for FP4 tensors in forward and backward passes, optionally set to 'e2m1'. Defaults to None. -- fp4_recipe: If set, enables FP4 precision through Transformer Engine. Currently only 'nvfp4' is supported, which uses the NVFP4BlockScaling recipe for Blackwell+ architecture. -- fp4_param_gather: If set, keeps the parameters in FP4 precision to save memory. Note that not all parameters will be converted to FP4; for example, biases will remain unchanged. +- fp4_recipe: The FP4 recipe used when FP4 precision is enabled through Transformer Engine (i.e., when fp4_format is set). Currently only 'nvfp4' is supported, which uses the NVFP4BlockScaling recipe for Blackwell+ architecture. Default is 'nvfp4'. +- fp4_param_gather: If set to True, keeps the parameters in FP4 precision to save memory. Note that not all parameters will be converted to FP4; for example, biases will remain unchanged. Default is False. **Mixed Precision Parameters**: