From 152593c5d440073907e4c3c6430f3c6ef2491990 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Sat, 9 May 2026 14:48:43 +0800 Subject: [PATCH 1/7] support gemma4 megatron --- .../Instruction/Supported-models-and-datasets.md | 16 ++++++++-------- .../Instruction/Supported-models-and-datasets.md | 16 ++++++++-------- swift/megatron/utils/convert_utils.py | 6 +++--- swift/model/models/gemma.py | 1 + 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/docs/source/Instruction/Supported-models-and-datasets.md b/docs/source/Instruction/Supported-models-and-datasets.md index 7115be751a..1b7369f945 100644 --- a/docs/source/Instruction/Supported-models-and-datasets.md +++ b/docs/source/Instruction/Supported-models-and-datasets.md @@ -1119,14 +1119,14 @@ |[google/gemma-3n-E4B](https://modelscope.cn/models/google/gemma-3n-E4B)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B)| |[google/gemma-3n-E2B-it](https://modelscope.cn/models/google/gemma-3n-E2B-it)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E2B-it](https://huggingface.co/google/gemma-3n-E2B-it)| |[google/gemma-3n-E4B-it](https://modelscope.cn/models/google/gemma-3n-E4B-it)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E4B-it](https://huggingface.co/google/gemma-3n-E4B-it)| -|[google/gemma-4-E2B](https://modelscope.cn/models/google/gemma-4-E2B)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E2B](https://huggingface.co/google/gemma-4-E2B)| -|[google/gemma-4-E2B-it](https://modelscope.cn/models/google/gemma-4-E2B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E2B-it](https://huggingface.co/google/gemma-4-E2B-it)| -|[google/gemma-4-E4B](https://modelscope.cn/models/google/gemma-4-E4B)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E4B](https://huggingface.co/google/gemma-4-E4B)| -|[google/gemma-4-E4B-it](https://modelscope.cn/models/google/gemma-4-E4B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E4B-it](https://huggingface.co/google/gemma-4-E4B-it)| -|[google/gemma-4-31B](https://modelscope.cn/models/google/gemma-4-31B)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B)| -|[google/gemma-4-31B-it](https://modelscope.cn/models/google/gemma-4-31B-it)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-31B-it](https://huggingface.co/google/gemma-4-31B-it)| -|[google/gemma-4-26B-A4B](https://modelscope.cn/models/google/gemma-4-26B-A4B)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-26B-A4B](https://huggingface.co/google/gemma-4-26B-A4B)| -|[google/gemma-4-26B-A4B-it](https://modelscope.cn/models/google/gemma-4-26B-A4B-it)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-26B-A4B-it](https://huggingface.co/google/gemma-4-26B-A4B-it)| +|[google/gemma-4-E2B](https://modelscope.cn/models/google/gemma-4-E2B)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E2B](https://huggingface.co/google/gemma-4-E2B)| +|[google/gemma-4-E2B-it](https://modelscope.cn/models/google/gemma-4-E2B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E2B-it](https://huggingface.co/google/gemma-4-E2B-it)| +|[google/gemma-4-E4B](https://modelscope.cn/models/google/gemma-4-E4B)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E4B](https://huggingface.co/google/gemma-4-E4B)| +|[google/gemma-4-E4B-it](https://modelscope.cn/models/google/gemma-4-E4B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E4B-it](https://huggingface.co/google/gemma-4-E4B-it)| +|[google/gemma-4-31B](https://modelscope.cn/models/google/gemma-4-31B)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B)| +|[google/gemma-4-31B-it](https://modelscope.cn/models/google/gemma-4-31B-it)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-31B-it](https://huggingface.co/google/gemma-4-31B-it)| +|[google/gemma-4-26B-A4B](https://modelscope.cn/models/google/gemma-4-26B-A4B)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-26B-A4B](https://huggingface.co/google/gemma-4-26B-A4B)| +|[google/gemma-4-26B-A4B-it](https://modelscope.cn/models/google/gemma-4-26B-A4B-it)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-26B-A4B-it](https://huggingface.co/google/gemma-4-26B-A4B-it)| |[mistralai/Mistral-Small-3.1-24B-Base-2503](https://modelscope.cn/models/mistralai/Mistral-Small-3.1-24B-Base-2503)|mistral3|mistral_2503|transformers>=4.49|✘|vision|[mistralai/Mistral-Small-3.1-24B-Base-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503)| |[mistralai/Mistral-Small-3.1-24B-Instruct-2503](https://modelscope.cn/models/mistralai/Mistral-Small-3.1-24B-Instruct-2503)|mistral3|mistral_2503|transformers>=4.49|✘|vision|[mistralai/Mistral-Small-3.1-24B-Instruct-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)| |[mistralai/Ministral-3-3B-Base-2512](https://modelscope.cn/models/mistralai/Ministral-3-3B-Base-2512)|mistral3|mistral_2512|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-3B-Base-2512](https://huggingface.co/mistralai/Ministral-3-3B-Base-2512)| diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index c70e7b7aaa..4431217b5e 100644 --- a/docs/source_en/Instruction/Supported-models-and-datasets.md +++ b/docs/source_en/Instruction/Supported-models-and-datasets.md @@ -1120,14 +1120,14 @@ The table below introduces the models integrated with ms-swift: |[google/gemma-3n-E4B](https://modelscope.cn/models/google/gemma-3n-E4B)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B)| |[google/gemma-3n-E2B-it](https://modelscope.cn/models/google/gemma-3n-E2B-it)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E2B-it](https://huggingface.co/google/gemma-3n-E2B-it)| |[google/gemma-3n-E4B-it](https://modelscope.cn/models/google/gemma-3n-E4B-it)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E4B-it](https://huggingface.co/google/gemma-3n-E4B-it)| -|[google/gemma-4-E2B](https://modelscope.cn/models/google/gemma-4-E2B)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E2B](https://huggingface.co/google/gemma-4-E2B)| -|[google/gemma-4-E2B-it](https://modelscope.cn/models/google/gemma-4-E2B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E2B-it](https://huggingface.co/google/gemma-4-E2B-it)| -|[google/gemma-4-E4B](https://modelscope.cn/models/google/gemma-4-E4B)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E4B](https://huggingface.co/google/gemma-4-E4B)| -|[google/gemma-4-E4B-it](https://modelscope.cn/models/google/gemma-4-E4B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E4B-it](https://huggingface.co/google/gemma-4-E4B-it)| -|[google/gemma-4-31B](https://modelscope.cn/models/google/gemma-4-31B)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B)| -|[google/gemma-4-31B-it](https://modelscope.cn/models/google/gemma-4-31B-it)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-31B-it](https://huggingface.co/google/gemma-4-31B-it)| -|[google/gemma-4-26B-A4B](https://modelscope.cn/models/google/gemma-4-26B-A4B)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-26B-A4B](https://huggingface.co/google/gemma-4-26B-A4B)| -|[google/gemma-4-26B-A4B-it](https://modelscope.cn/models/google/gemma-4-26B-A4B-it)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-26B-A4B-it](https://huggingface.co/google/gemma-4-26B-A4B-it)| +|[google/gemma-4-E2B](https://modelscope.cn/models/google/gemma-4-E2B)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E2B](https://huggingface.co/google/gemma-4-E2B)| +|[google/gemma-4-E2B-it](https://modelscope.cn/models/google/gemma-4-E2B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E2B-it](https://huggingface.co/google/gemma-4-E2B-it)| +|[google/gemma-4-E4B](https://modelscope.cn/models/google/gemma-4-E4B)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E4B](https://huggingface.co/google/gemma-4-E4B)| +|[google/gemma-4-E4B-it](https://modelscope.cn/models/google/gemma-4-E4B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E4B-it](https://huggingface.co/google/gemma-4-E4B-it)| +|[google/gemma-4-31B](https://modelscope.cn/models/google/gemma-4-31B)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B)| +|[google/gemma-4-31B-it](https://modelscope.cn/models/google/gemma-4-31B-it)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-31B-it](https://huggingface.co/google/gemma-4-31B-it)| +|[google/gemma-4-26B-A4B](https://modelscope.cn/models/google/gemma-4-26B-A4B)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-26B-A4B](https://huggingface.co/google/gemma-4-26B-A4B)| +|[google/gemma-4-26B-A4B-it](https://modelscope.cn/models/google/gemma-4-26B-A4B-it)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-26B-A4B-it](https://huggingface.co/google/gemma-4-26B-A4B-it)| |[mistralai/Mistral-Small-3.1-24B-Base-2503](https://modelscope.cn/models/mistralai/Mistral-Small-3.1-24B-Base-2503)|mistral3|mistral_2503|transformers>=4.49|✘|vision|[mistralai/Mistral-Small-3.1-24B-Base-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503)| |[mistralai/Mistral-Small-3.1-24B-Instruct-2503](https://modelscope.cn/models/mistralai/Mistral-Small-3.1-24B-Instruct-2503)|mistral3|mistral_2503|transformers>=4.49|✘|vision|[mistralai/Mistral-Small-3.1-24B-Instruct-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)| |[mistralai/Ministral-3-3B-Base-2512](https://modelscope.cn/models/mistralai/Ministral-3-3B-Base-2512)|mistral3|mistral_2512|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-3B-Base-2512](https://huggingface.co/mistralai/Ministral-3-3B-Base-2512)| diff --git a/swift/megatron/utils/convert_utils.py b/swift/megatron/utils/convert_utils.py index e10c5cf3ce..f95c260f59 100644 --- a/swift/megatron/utils/convert_utils.py +++ b/swift/megatron/utils/convert_utils.py @@ -66,9 +66,9 @@ def _model_cpu_forward_context(modules, origin_torch_dtype = next(modules[0].parameters()).dtype except StopIteration: origin_torch_dtype = next(modules[-1].parameters()).dtype - embedding = None + embeddings = None if share_embedding: - embedding = [module for module in modules if isinstance(module, (nn.Embedding, VocabParallelEmbedding))][-1] + embeddings = [module for module in modules if isinstance(module, (nn.Embedding, VocabParallelEmbedding))] def _to_cuda_hook(module, args): if compute_device is not None or torch_dtype is not None: @@ -77,7 +77,7 @@ def _to_cuda_hook(module, args): return args def _to_cpu_hook(module, args, output): - if share_embedding and module is embedding: + if share_embedding and module in embeddings: return module.to(device=target_device, dtype=origin_torch_dtype) diff --git a/swift/model/models/gemma.py b/swift/model/models/gemma.py index a1b6e3a9dc..4d1eedf357 100644 --- a/swift/model/models/gemma.py +++ b/swift/model/models/gemma.py @@ -263,6 +263,7 @@ def forward( if self.config.get_text_config().hidden_size_per_layer_input: pad_embedding = self.language_model.embed_tokens.weight[self.config.text_config.pad_token_id, :] + pad_embedding = pad_embedding.to(multimodal_mask.device) llm_inputs_embeds = torch.where(multimodal_mask[..., None], pad_embedding.view(1, 1, -1), inputs_embeds) per_layer_inputs = self.language_model.get_per_layer_inputs(llm_input_ids, llm_inputs_embeds) else: From d7c2f289d728f4387b5712fe507e8ad0b2e834e6 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 19 May 2026 02:00:26 +0800 Subject: [PATCH 2/7] update --- swift/megatron/utils/convert_utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/swift/megatron/utils/convert_utils.py b/swift/megatron/utils/convert_utils.py index 4b487737a6..7b589b84ed 100644 --- a/swift/megatron/utils/convert_utils.py +++ b/swift/megatron/utils/convert_utils.py @@ -62,10 +62,13 @@ def _model_cpu_forward_context(modules, compute_device=None, share_embedding: bool = False, target_device='cpu'): - try: - origin_torch_dtype = next(modules[0].parameters()).dtype - except StopIteration: - origin_torch_dtype = next(modules[-1].parameters()).dtype + for module in modules: + try: + origin_torch_dtype = next(module.parameters()).dtype + except StopIteration: + pass + else: + break embeddings = None if share_embedding: embeddings = [module for module in modules if isinstance(module, (nn.Embedding, VocabParallelEmbedding))] @@ -77,7 +80,7 @@ def _to_cuda_hook(module, args): return args def _to_cpu_hook(module, args, output): - if share_embedding and module in embeddings: + if share_embedding and module in embeddings or 'rotaryemb' in module.__class__.__name__.lower(): return module.to(device=target_device, dtype=origin_torch_dtype) From 96ff1662beef0a4345270fa633cdac76a7e5c46b Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 19 May 2026 15:57:08 +0800 Subject: [PATCH 3/7] update --- examples/models/gemma4/megatron.sh | 50 ++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 examples/models/gemma4/megatron.sh diff --git a/examples/models/gemma4/megatron.sh b/examples/models/gemma4/megatron.sh new file mode 100644 index 0000000000..df9f4c6f30 --- /dev/null +++ b/examples/models/gemma4/megatron.sh @@ -0,0 +1,50 @@ +# 8 * 80GiB +PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \ +NPROC_PER_NODE=8 \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +megatron sft \ + --model google/gemma-4-26B-A4B-it \ + --save_safetensors true \ + --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ + 'AI-ModelScope/alpaca-gpt4-data-en#500' \ + 'swift/self-cognition#500' \ + 'AI-ModelScope/LaTeX_OCR:human_handwrite#2000' \ + --load_from_cache_file true \ + --add_non_thinking_prefix true \ + --split_dataset_ratio 0.01 \ + --tuner_type full \ + --tensor_model_parallel_size 2 \ + --expert_model_parallel_size 4 \ + --pipeline_model_parallel_size 2 \ + --moe_permute_fusion true \ + --moe_grouped_gemm true \ + --moe_shared_expert_overlap true \ + --moe_aux_loss_coeff 1e-6 \ + --micro_batch_size 4 \ + --global_batch_size 16 \ + --recompute_granularity full \ + --recompute_method uniform \ + --recompute_num_layers 1 \ + --num_train_epochs 1 \ + --finetune true \ + --freeze_llm false \ + --freeze_vit true \ + --freeze_aligner true \ + --cross_entropy_loss_fusion true \ + --lr 1e-5 \ + --lr_warmup_fraction 0.05 \ + --min_lr 1e-6 \ + --output_dir megatron_output/gemma-4-26B-A4B-it \ + --eval_steps 200 \ + --save_steps 200 \ + --max_length 4096 \ + --dataloader_num_workers 8 \ + --dataset_num_proc 8 \ + --no_save_optim true \ + --no_save_rng true \ + --sequence_parallel true \ + --attention_backend unfused \ + --group_by_length true \ + --padding_free false \ + --model_author swift \ + --model_name swift-robot From fd783fc610d78238a0914d3e72cc1860c10c6d5a Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 19 May 2026 16:00:45 +0800 Subject: [PATCH 4/7] update --- examples/models/gemma4/megatron.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/gemma4/megatron.sh b/examples/models/gemma4/megatron.sh index df9f4c6f30..4458b3b762 100644 --- a/examples/models/gemma4/megatron.sh +++ b/examples/models/gemma4/megatron.sh @@ -20,7 +20,7 @@ megatron sft \ --moe_grouped_gemm true \ --moe_shared_expert_overlap true \ --moe_aux_loss_coeff 1e-6 \ - --micro_batch_size 4 \ + --micro_batch_size 8 \ --global_batch_size 16 \ --recompute_granularity full \ --recompute_method uniform \ From c8aa1c140f198114d0c38b337114ba9522fb10ef Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 19 May 2026 16:02:08 +0800 Subject: [PATCH 5/7] update --- examples/models/gemma4/megatron.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/models/gemma4/megatron.sh b/examples/models/gemma4/megatron.sh index 4458b3b762..b301e59e78 100644 --- a/examples/models/gemma4/megatron.sh +++ b/examples/models/gemma4/megatron.sh @@ -1,4 +1,6 @@ # 8 * 80GiB +# Due to the use of group_by_length, the data is not sufficiently shuffled, +# which may cause fluctuations in the loss curve. Please adjust the parameters accordingly. PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \ NPROC_PER_NODE=8 \ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ From 845ad34b5cfa1bbb0cab152845b79bbbdccc2d3d Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 19 May 2026 16:09:23 +0800 Subject: [PATCH 6/7] update --- examples/models/gemma4/megatron.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/models/gemma4/megatron.sh b/examples/models/gemma4/megatron.sh index b301e59e78..97294e1ea8 100644 --- a/examples/models/gemma4/megatron.sh +++ b/examples/models/gemma4/megatron.sh @@ -37,8 +37,8 @@ megatron sft \ --lr_warmup_fraction 0.05 \ --min_lr 1e-6 \ --output_dir megatron_output/gemma-4-26B-A4B-it \ - --eval_steps 200 \ - --save_steps 200 \ + --eval_steps 500 \ + --save_steps 500 \ --max_length 4096 \ --dataloader_num_workers 8 \ --dataset_num_proc 8 \ @@ -50,3 +50,10 @@ megatron sft \ --padding_free false \ --model_author swift \ --model_name swift-robot + +# CUDA_VISIBLE_DEVICES=0 swift infer \ +# --model megatron_output/gemma-4-26B-A4B-it/vx-xxx/checkpoint-xxx \ +# --stream true \ +# --enable_thinking false \ +# --load_data_args true \ +# --max_new_tokens 2048 From 72a258dc998fbe811d74a6d0f3da99ec41b3bb51 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 19 May 2026 16:48:09 +0800 Subject: [PATCH 7/7] update --- examples/models/gemma4/{megatron.sh => mcore.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/models/gemma4/{megatron.sh => mcore.sh} (100%) diff --git a/examples/models/gemma4/megatron.sh b/examples/models/gemma4/mcore.sh similarity index 100% rename from examples/models/gemma4/megatron.sh rename to examples/models/gemma4/mcore.sh