diff --git a/docs/source/Instruction/Supported-models-and-datasets.md b/docs/source/Instruction/Supported-models-and-datasets.md index 27e847a8fa..e26b7da0ad 100644 --- a/docs/source/Instruction/Supported-models-and-datasets.md +++ b/docs/source/Instruction/Supported-models-and-datasets.md @@ -1125,14 +1125,14 @@ |[google/gemma-3n-E4B](https://modelscope.cn/models/google/gemma-3n-E4B)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B)| |[google/gemma-3n-E2B-it](https://modelscope.cn/models/google/gemma-3n-E2B-it)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E2B-it](https://huggingface.co/google/gemma-3n-E2B-it)| |[google/gemma-3n-E4B-it](https://modelscope.cn/models/google/gemma-3n-E4B-it)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E4B-it](https://huggingface.co/google/gemma-3n-E4B-it)| -|[google/gemma-4-E2B](https://modelscope.cn/models/google/gemma-4-E2B)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E2B](https://huggingface.co/google/gemma-4-E2B)| -|[google/gemma-4-E2B-it](https://modelscope.cn/models/google/gemma-4-E2B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E2B-it](https://huggingface.co/google/gemma-4-E2B-it)| -|[google/gemma-4-E4B](https://modelscope.cn/models/google/gemma-4-E4B)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E4B](https://huggingface.co/google/gemma-4-E4B)| -|[google/gemma-4-E4B-it](https://modelscope.cn/models/google/gemma-4-E4B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E4B-it](https://huggingface.co/google/gemma-4-E4B-it)| -|[google/gemma-4-31B](https://modelscope.cn/models/google/gemma-4-31B)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B)| -|[google/gemma-4-31B-it](https://modelscope.cn/models/google/gemma-4-31B-it)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-31B-it](https://huggingface.co/google/gemma-4-31B-it)| -|[google/gemma-4-26B-A4B](https://modelscope.cn/models/google/gemma-4-26B-A4B)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-26B-A4B](https://huggingface.co/google/gemma-4-26B-A4B)| -|[google/gemma-4-26B-A4B-it](https://modelscope.cn/models/google/gemma-4-26B-A4B-it)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-26B-A4B-it](https://huggingface.co/google/gemma-4-26B-A4B-it)| +|[google/gemma-4-E2B](https://modelscope.cn/models/google/gemma-4-E2B)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E2B](https://huggingface.co/google/gemma-4-E2B)| +|[google/gemma-4-E2B-it](https://modelscope.cn/models/google/gemma-4-E2B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E2B-it](https://huggingface.co/google/gemma-4-E2B-it)| +|[google/gemma-4-E4B](https://modelscope.cn/models/google/gemma-4-E4B)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E4B](https://huggingface.co/google/gemma-4-E4B)| +|[google/gemma-4-E4B-it](https://modelscope.cn/models/google/gemma-4-E4B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E4B-it](https://huggingface.co/google/gemma-4-E4B-it)| +|[google/gemma-4-31B](https://modelscope.cn/models/google/gemma-4-31B)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B)| +|[google/gemma-4-31B-it](https://modelscope.cn/models/google/gemma-4-31B-it)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-31B-it](https://huggingface.co/google/gemma-4-31B-it)| +|[google/gemma-4-26B-A4B](https://modelscope.cn/models/google/gemma-4-26B-A4B)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-26B-A4B](https://huggingface.co/google/gemma-4-26B-A4B)| +|[google/gemma-4-26B-A4B-it](https://modelscope.cn/models/google/gemma-4-26B-A4B-it)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-26B-A4B-it](https://huggingface.co/google/gemma-4-26B-A4B-it)| |[mistralai/Mistral-Small-3.1-24B-Base-2503](https://modelscope.cn/models/mistralai/Mistral-Small-3.1-24B-Base-2503)|mistral3|mistral_2503|transformers>=4.49|✘|vision|[mistralai/Mistral-Small-3.1-24B-Base-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503)| |[mistralai/Mistral-Small-3.1-24B-Instruct-2503](https://modelscope.cn/models/mistralai/Mistral-Small-3.1-24B-Instruct-2503)|mistral3|mistral_2503|transformers>=4.49|✘|vision|[mistralai/Mistral-Small-3.1-24B-Instruct-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)| |[mistralai/Ministral-3-3B-Base-2512](https://modelscope.cn/models/mistralai/Ministral-3-3B-Base-2512)|mistral3|mistral_2512|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-3B-Base-2512](https://huggingface.co/mistralai/Ministral-3-3B-Base-2512)| diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index 19be0f9d53..de6dfd02c1 100644 --- a/docs/source_en/Instruction/Supported-models-and-datasets.md +++ b/docs/source_en/Instruction/Supported-models-and-datasets.md @@ -1126,14 +1126,14 @@ The table below introduces the models integrated with ms-swift: |[google/gemma-3n-E4B](https://modelscope.cn/models/google/gemma-3n-E4B)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B)| |[google/gemma-3n-E2B-it](https://modelscope.cn/models/google/gemma-3n-E2B-it)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E2B-it](https://huggingface.co/google/gemma-3n-E2B-it)| |[google/gemma-3n-E4B-it](https://modelscope.cn/models/google/gemma-3n-E4B-it)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E4B-it](https://huggingface.co/google/gemma-3n-E4B-it)| -|[google/gemma-4-E2B](https://modelscope.cn/models/google/gemma-4-E2B)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E2B](https://huggingface.co/google/gemma-4-E2B)| -|[google/gemma-4-E2B-it](https://modelscope.cn/models/google/gemma-4-E2B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E2B-it](https://huggingface.co/google/gemma-4-E2B-it)| -|[google/gemma-4-E4B](https://modelscope.cn/models/google/gemma-4-E4B)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E4B](https://huggingface.co/google/gemma-4-E4B)| -|[google/gemma-4-E4B-it](https://modelscope.cn/models/google/gemma-4-E4B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✘|-|[google/gemma-4-E4B-it](https://huggingface.co/google/gemma-4-E4B-it)| -|[google/gemma-4-31B](https://modelscope.cn/models/google/gemma-4-31B)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B)| -|[google/gemma-4-31B-it](https://modelscope.cn/models/google/gemma-4-31B-it)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-31B-it](https://huggingface.co/google/gemma-4-31B-it)| -|[google/gemma-4-26B-A4B](https://modelscope.cn/models/google/gemma-4-26B-A4B)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-26B-A4B](https://huggingface.co/google/gemma-4-26B-A4B)| -|[google/gemma-4-26B-A4B-it](https://modelscope.cn/models/google/gemma-4-26B-A4B-it)|gemma4|gemma4|transformers>=4.53|✘|-|[google/gemma-4-26B-A4B-it](https://huggingface.co/google/gemma-4-26B-A4B-it)| +|[google/gemma-4-E2B](https://modelscope.cn/models/google/gemma-4-E2B)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E2B](https://huggingface.co/google/gemma-4-E2B)| +|[google/gemma-4-E2B-it](https://modelscope.cn/models/google/gemma-4-E2B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E2B-it](https://huggingface.co/google/gemma-4-E2B-it)| +|[google/gemma-4-E4B](https://modelscope.cn/models/google/gemma-4-E4B)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E4B](https://huggingface.co/google/gemma-4-E4B)| +|[google/gemma-4-E4B-it](https://modelscope.cn/models/google/gemma-4-E4B-it)|gemma4|gemma4_nothinking|transformers>=4.53|✔|-|[google/gemma-4-E4B-it](https://huggingface.co/google/gemma-4-E4B-it)| +|[google/gemma-4-31B](https://modelscope.cn/models/google/gemma-4-31B)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B)| +|[google/gemma-4-31B-it](https://modelscope.cn/models/google/gemma-4-31B-it)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-31B-it](https://huggingface.co/google/gemma-4-31B-it)| +|[google/gemma-4-26B-A4B](https://modelscope.cn/models/google/gemma-4-26B-A4B)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-26B-A4B](https://huggingface.co/google/gemma-4-26B-A4B)| +|[google/gemma-4-26B-A4B-it](https://modelscope.cn/models/google/gemma-4-26B-A4B-it)|gemma4|gemma4|transformers>=4.53|✔|-|[google/gemma-4-26B-A4B-it](https://huggingface.co/google/gemma-4-26B-A4B-it)| |[mistralai/Mistral-Small-3.1-24B-Base-2503](https://modelscope.cn/models/mistralai/Mistral-Small-3.1-24B-Base-2503)|mistral3|mistral_2503|transformers>=4.49|✘|vision|[mistralai/Mistral-Small-3.1-24B-Base-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503)| |[mistralai/Mistral-Small-3.1-24B-Instruct-2503](https://modelscope.cn/models/mistralai/Mistral-Small-3.1-24B-Instruct-2503)|mistral3|mistral_2503|transformers>=4.49|✘|vision|[mistralai/Mistral-Small-3.1-24B-Instruct-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)| |[mistralai/Ministral-3-3B-Base-2512](https://modelscope.cn/models/mistralai/Ministral-3-3B-Base-2512)|mistral3|mistral_2512|transformers>=5.0.0.dev0, mistral-common>=1.8.6|✘|vision|[mistralai/Ministral-3-3B-Base-2512](https://huggingface.co/mistralai/Ministral-3-3B-Base-2512)| diff --git a/examples/models/gemma4/mcore.sh b/examples/models/gemma4/mcore.sh new file mode 100644 index 0000000000..97294e1ea8 --- /dev/null +++ b/examples/models/gemma4/mcore.sh @@ -0,0 +1,59 @@ +# 8 * 80GiB +# Due to the use of group_by_length, the data is not sufficiently shuffled, +# which may cause fluctuations in the loss curve. Please adjust the parameters accordingly. +PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \ +NPROC_PER_NODE=8 \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +megatron sft \ + --model google/gemma-4-26B-A4B-it \ + --save_safetensors true \ + --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ + 'AI-ModelScope/alpaca-gpt4-data-en#500' \ + 'swift/self-cognition#500' \ + 'AI-ModelScope/LaTeX_OCR:human_handwrite#2000' \ + --load_from_cache_file true \ + --add_non_thinking_prefix true \ + --split_dataset_ratio 0.01 \ + --tuner_type full \ + --tensor_model_parallel_size 2 \ + --expert_model_parallel_size 4 \ + --pipeline_model_parallel_size 2 \ + --moe_permute_fusion true \ + --moe_grouped_gemm true \ + --moe_shared_expert_overlap true \ + --moe_aux_loss_coeff 1e-6 \ + --micro_batch_size 8 \ + --global_batch_size 16 \ + --recompute_granularity full \ + --recompute_method uniform \ + --recompute_num_layers 1 \ + --num_train_epochs 1 \ + --finetune true \ + --freeze_llm false \ + --freeze_vit true \ + --freeze_aligner true \ + --cross_entropy_loss_fusion true \ + --lr 1e-5 \ + --lr_warmup_fraction 0.05 \ + --min_lr 1e-6 \ + --output_dir megatron_output/gemma-4-26B-A4B-it \ + --eval_steps 500 \ + --save_steps 500 \ + --max_length 4096 \ + --dataloader_num_workers 8 \ + --dataset_num_proc 8 \ + --no_save_optim true \ + --no_save_rng true \ + --sequence_parallel true \ + --attention_backend unfused \ + --group_by_length true \ + --padding_free false \ + --model_author swift \ + --model_name swift-robot + +# CUDA_VISIBLE_DEVICES=0 swift infer \ +# --model megatron_output/gemma-4-26B-A4B-it/vx-xxx/checkpoint-xxx \ +# --stream true \ +# --enable_thinking false \ +# --load_data_args true \ +# --max_new_tokens 2048 diff --git a/swift/megatron/utils/convert_utils.py b/swift/megatron/utils/convert_utils.py index 4b487737a6..7b589b84ed 100644 --- a/swift/megatron/utils/convert_utils.py +++ b/swift/megatron/utils/convert_utils.py @@ -62,10 +62,13 @@ def _model_cpu_forward_context(modules, compute_device=None, share_embedding: bool = False, target_device='cpu'): - try: - origin_torch_dtype = next(modules[0].parameters()).dtype - except StopIteration: - origin_torch_dtype = next(modules[-1].parameters()).dtype + for module in modules: + try: + origin_torch_dtype = next(module.parameters()).dtype + except StopIteration: + pass + else: + break embeddings = None if share_embedding: embeddings = [module for module in modules if isinstance(module, (nn.Embedding, VocabParallelEmbedding))] @@ -77,7 +80,7 @@ def _to_cuda_hook(module, args): return args def _to_cpu_hook(module, args, output): - if share_embedding and module in embeddings: + if share_embedding and module in embeddings or 'rotaryemb' in module.__class__.__name__.lower(): return module.to(device=target_device, dtype=origin_torch_dtype)