From 1102650eda9d2bcdf2b2a571144dd85168cfdbe6 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Mon, 8 Jul 2024 09:32:14 -0700 Subject: [PATCH 1/9] add mamba --- .../conf/fine_tuning/mamba/sft.yaml | 237 ++++++++++++++++++ launcher_scripts/nemo_launcher/core/stages.py | 3 + 2 files changed, 240 insertions(+) create mode 100644 launcher_scripts/conf/fine_tuning/mamba/sft.yaml diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml new file mode 100644 index 000000000..30cdbf1f9 --- /dev/null +++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml @@ -0,0 +1,237 @@ +run: + name: sft_mamba + results_dir: ${base_results_dir}/${fine_tuning.run.name} + time_limit: "00:45:00" + dependency: "singleton" + +trainer: + devices: 8 + accelerator: gpu + num_nodes: 1 + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 10000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 1 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 
0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + limit_val_batches: 1024 + limit_test_batches: 500 + +exp_manager: + explicit_log_dir: ${fine_tuning.run.results_dir}/results + exp_dir: null + name: ${name} + create_wandb_logger: True + wandb_logger_kwargs: + project: mamba + name: sft-test + resume_if_exists: False + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${fine_tuning.model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + + encoder_seq_length: 1024 + global_batch_size: 8 + micro_batch_size: 1 + restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. 
+ sync_batch_comm: False + megatron_amp_O2: False + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null + answer_only_loss: True + gradient_as_bucket_view: False + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + + peft: + peft_scheme: "lora" # can be either adapter,ia3, lora, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. 
null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + target_modules: ['all'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) + adapter_dim: 32 + alpha: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add ia3 adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + + selective_tuning: + tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre + + + data: + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: ??? # Path to a list of JSONL files corresponding to the source data. 
+ global_batch_size: ${fine_tuning.model.global_batch_size} + micro_batch_size: ${fine_tuning.model.micro_batch_size} + shuffle: True + num_workers: 0 + memmap_workers: 2 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: [1.0] # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + truncation_field: "input" # # Can be multiple keys separated with ',' Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + validation_ds: + file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${fine_tuning.model.global_batch_size} + micro_batch_size: ${fine_tuning.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + label_key: ${fine_tuning.model.data.train_ds.label_key} + add_eos: ${fine_tuning.model.data.train_ds.add_eos} + add_sep: ${fine_tuning.model.data.train_ds.add_sep} + add_bos: ${fine_tuning.model.data.train_ds.add_bos} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. 
+ truncation_field: ${fine_tuning.model.data.train_ds.truncation_field} # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${fine_tuning.model.global_batch_size} + micro_batch_size: ${fine_tuning.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${fine_tuning.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + label_key: ${fine_tuning.model.data.train_ds.label_key} + add_eos: ${fine_tuning.model.data.train_ds.add_eos} + add_sep: ${fine_tuning.model.data.train_ds.add_sep} + add_bos: ${fine_tuning.model.data.train_ds.add_bos} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: ${fine_tuning.model.data.train_ds.truncation_field} # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. 
+ prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + optim: + name: distributed_fused_adam + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 50000 + min_lr: 2e-5 diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index c6daa0239..79b1bfdaf 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -42,6 +42,7 @@ "gemma", "falcon", "baichuan2", + "mamba", "mistral", "mistral_embedding", "mixtral", @@ -1004,6 +1005,8 @@ def _get_nemo_code_path(self, model_type: str) -> Path: / "examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py", "t5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py", + "mamba": self._nemo_code_path + / "examples/nlp/language_modeling/tuning/megatron_mamba_finetuning.py", "mt5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py", "falcon": self._nemo_code_path From 959ac872531683d7adbca3585a1a0a539cdc3835 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Mon, 8 Jul 2024 11:39:19 -0700 Subject: [PATCH 2/9] fix naming --- launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml index 30cdbf1f9..705436e13 100644 --- a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml +++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml @@ -23,7 +23,7 @@ trainer: exp_manager: explicit_log_dir: ${fine_tuning.run.results_dir}/results exp_dir: null - name: ${name} + name: mamba create_wandb_logger: True wandb_logger_kwargs: project: mamba From ec44ff58f60689d3fc97e5382eb10c19e0c53328 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Mon, 8 Jul 2024 11:53:44 -0700 Subject: [PATCH 3/9] fix more naming --- launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml index 705436e13..975f8aa57 100644 --- a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml +++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml @@ -164,7 +164,7 @@ model: pad_to_max_length: True validation_ds: file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: null # Names of the corresponding datasets used to log metrics. + name: "squad" # Names of the corresponding datasets used to log metrics. 
global_batch_size: ${fine_tuning.model.global_batch_size} micro_batch_size: ${fine_tuning.model.micro_batch_size} shuffle: False From f3267f5cd5cc92dcc50e2ae5b2c850ac7865579c Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Mon, 8 Jul 2024 11:56:44 -0700 Subject: [PATCH 4/9] fix more naming --- launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml index 975f8aa57..891550584 100644 --- a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml +++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml @@ -32,7 +32,7 @@ exp_manager: resume_ignore_no_checkpoint: True create_checkpoint_callback: True checkpoint_callback_params: - monitor: validation_${model.data.validation_ds.metric.name} + monitor: validation_${fine_tuning.model.data.validation_ds.metric.name} save_top_k: 1 mode: min save_nemo_on_train_end: True From 0aefd0e623c13d8a783acd5298add576f2cb8eeb Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Mon, 8 Jul 2024 12:02:51 -0700 Subject: [PATCH 5/9] fix more naming --- launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml index 891550584..60a7625bc 100644 --- a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml +++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml @@ -36,7 +36,7 @@ exp_manager: save_top_k: 1 mode: min save_nemo_on_train_end: True - filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + filename: '${fine_tuning.run.name}--{${fine_tuning.exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' model_parallel_size: ${fine_tuning.model.tensor_model_parallel_size} always_save_nemo: False save_best_model: True From 0acc834d4155b6b31956172f611b608ee5a4f1e3 Mon Sep 17 
00:00:00 2001 From: Ali Taghibakhshi Date: Mon, 8 Jul 2024 12:06:54 -0700 Subject: [PATCH 6/9] fix more naming --- launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml index 60a7625bc..c16039fd0 100644 --- a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml +++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml @@ -169,7 +169,7 @@ model: micro_batch_size: ${fine_tuning.model.micro_batch_size} shuffle: False num_workers: 0 - memmap_workers: ${model.data.train_ds.memmap_workers} + memmap_workers: ${fine_tuning.model.data.train_ds.memmap_workers} pin_memory: True max_seq_length: 2048 min_seq_length: 1 From 1b43a8fe315803d5d8f5acf7defa1ec6491b157b Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Tue, 9 Jul 2024 08:56:04 -0700 Subject: [PATCH 7/9] set peft default to none --- launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml index c16039fd0..404a807bc 100644 --- a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml +++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml @@ -85,7 +85,7 @@ model: ffn_dropout: 0.0 peft: - peft_scheme: "lora" # can be either adapter,ia3, lora, or ptuning + peft_scheme: "null" # can be either adapter,ia3, lora, or ptuning restore_from_path: null # Used for adapter peft training From 7080399984b1543644b98f19f163d1c8d01b5b18 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Tue, 9 Jul 2024 09:11:45 -0700 Subject: [PATCH 8/9] set peft default to none --- launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml index 404a807bc..393f8b335 100644 --- 
a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml +++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml @@ -85,7 +85,7 @@ model: ffn_dropout: 0.0 peft: - peft_scheme: "null" # can be either adapter,ia3, lora, or ptuning + peft_scheme: "none" # can be either adapter, ia3, lora, ptuning, or none restore_from_path: null # Used for adapter peft training From 8c7e32b7b8f9e80321cbd10ebc9f67bb22d883ec Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Tue, 9 Jul 2024 10:07:46 -0700 Subject: [PATCH 9/9] wandb to false --- launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml index 393f8b335..b3b2b4770 100644 --- a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml +++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml @@ -24,7 +24,7 @@ exp_manager: explicit_log_dir: ${fine_tuning.run.results_dir}/results exp_dir: null name: mamba - create_wandb_logger: True + create_wandb_logger: False wandb_logger_kwargs: project: mamba name: sft-test